This tutorial covers the basic population genetics analyses for a set of populations (or closely related species)
1) Evaluating, filtering and plotting genetic data
2) Test for Hardy-Weinberg-Equilibrium (HWE) and linkage between loci
3) Calculating, assessing and plotting genotypic richness and diversity, inbreeding and heterozygosity
4) Calculating and plotting genetic differentiation (Fst, Gst and Josts D)
5) Calculating and plotting isolation-by-distance (IBD)
6) Performing and plotting principal component analyses (PCA)
7) Performing and plotting discriminant analyses of principal components (DAPC) with cross-validation
8) Creating Structure-like plots and maps based on DAPC results

As input is required (see Import data paragraph below)
1) a vcf file with the genetic SNP data
2) a dataset containing individual IDs, longitude, latitude and population/species assignment for each sample/individual (it is recommended to name individuals in SNP data exactly like IDs in dataset to facilitate any filtering/subsetting data)

The tutorial uses an example dataset of unpublished ddRAD data (121 individiduals of 11 taxa) from the Hemileuca maia buck moth species group (Saturniidae: Lepidoptera) distributed throughout the US and southern Canada

1) Evaluate data

Install and load required packages
## Ensure required installers are present
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")

## Install and load all packages
install_if_missing <- function(pkg) { 
  if (!require(pkg, character.only = T)) {
    install.packages(pkg, dependencies = T)
    library(pkg, character.only = T)}}  
install_if_missing("adegenet")
install_if_missing("conStruct")
install_if_missing("dplyr")
install_if_missing("geosphere")
install_if_missing("ggplot2")
install_if_missing("hierfstat")
install_if_missing("mmod")
install_if_missing("pegas")
install_if_missing("plotly")
install_if_missing("poppr")
install_if_missing("reshape2")
install_if_missing("StAMPP")
install_if_missing("stringr")
install_if_missing("vcfR")
install_if_missing("viridis")

## Install SNPRelate from Bioconductor
if (!requireNamespace("SNPRelate", quietly = TRUE)) {BiocManager::install("SNPRelate")}
library(SNPRelate)
Import data

Ignore any warnings that may come up (not shown in output here)

rm(list = ls()) #clear environment

setwd("C:/Users/danie/Desktop/PhD research/Tutorials/Population_genetics_tutorial")
#set working directory (containing vcf and datafile)
dataset <- read.table(file = "Hmg_data.txt", sep = "\t", header = T, stringsAsFactors = T) #import dataset containing columns: ID, Species, longitude, Latitude
genpop_data <- vcfR2genind(read.vcfR("Hmg_pop.vcf")) #import vcf file and convert to genind object
Remove outgroup (since it is not necessary for any analyses here)
outgroup <- c("CA_OroGrandeWash_Hmg086", "CA_OroGrandeWash_Hmg087")
genpop_data <- genpop_data[!indNames(genpop_data) %in% outgroup, ]
dataset <- dataset[!dataset$ID%in% outgroup, ]
Set population and individual IDs
population_assignment <- dataset$Species #set population assignment
population_assignment <- factor(population_assignment, levels = unique(population_assignment))
genpop_data$pop <- droplevels(as.factor(population_assignment))
droplevels(as.factor(population_assignment))
Evaluate data
head(dataset) #check dataset
##                         ID         Species Latitude Longitude
## 1     NH_CheshireCo_Hmg129        H.lucina 42.92000 -71.22000
## 2     NH_CheshireCo_Hmg130        H.lucina 42.92000 -71.22000
## 3     NH_CheshireCo_Hmg308        H.lucina 42.92000 -71.22000
## 4       MA_Montague_Hmg314 H.maia.x.lucina 42.55740 -72.51770
## 5     IA_LoessHills_Hmg315    H.latifascia 41.92489 -95.99636
## 6 AL_DunavantValley_Hmg344          H.maia 33.44000 -86.61000
table(genpop_data$pop) #show taxa and number of samples per taxon
## 
##               H.lucina        H.maia.x.lucina           H.latifascia 
##                      9                      1                     11 
##                 H.maia H.sp.GreatLakesComplex           H.nevadensis 
##                     47                     10                     16 
##             H.slosseri             H.peigleri 
##                     17                     10
nLoc(genpop_data) #show number of loci
## [1] 5173
unique(genpop_data@pop) #show all species (or populations )
## [1] H.lucina               H.maia.x.lucina        H.latifascia          
## [4] H.maia                 H.sp.GreatLakesComplex H.nevadensis          
## [7] H.slosseri             H.peigleri            
## 8 Levels: H.lucina H.maia.x.lucina H.latifascia ... H.peigleri
poppr(genpop_data) #show basic summary table, including number of individuals (N) and genotypes (MLG) per population, Shannon-Wiener index (H; Shannon 2001, genotype diversity), Stoddart and Taylor's (1988) G index (G, genotype diversity) and Nei’s unbiased gene diversity (Nei 1978; Hexp)
##                      Pop   N MLG eMLG       SE    H   G lambda E.5 Hexp   Ia
## 1               H.lucina   9   9    9 0.00e+00 2.20   9  0.889   1    0 60.4
## 2        H.maia.x.lucina   1   1    1 0.00e+00 0.00   1  0.000 NaN    0   NA
## 3           H.latifascia  11  11   10 0.00e+00 2.40  11  0.909   1    0 16.6
## 4                 H.maia  47  47   10 0.00e+00 3.85  47  0.979   1    0 26.7
## 5 H.sp.GreatLakesComplex  10  10   10 0.00e+00 2.30  10  0.900   1    0 36.8
## 6           H.nevadensis  16  16   10 0.00e+00 2.77  16  0.938   1    0 36.9
## 7             H.slosseri  17  17   10 2.31e-07 2.83  17  0.941   1    0 44.0
## 8             H.peigleri  10  10   10 0.00e+00 2.30  10  0.900   1    0 29.2
## 9                  Total 121 121   10 0.00e+00 4.80 121  0.992   1    0 28.8
##     rbarD        File
## 1 0.03413 genpop_data
## 2      NA genpop_data
## 3 0.00692 genpop_data
## 4 0.00758 genpop_data
## 5 0.01671 genpop_data
## 6 0.01486 genpop_data
## 7 0.01669 genpop_data
## 8 0.01454 genpop_data
## 9 0.00736 genpop_data
sort(private_alleles(genpop_data) %>% apply(MARGIN = 1, FUN = sum), decreasing = T) #show number of private alleles (alleles found exclusively in one population) per site across all loci for each population
##                 H.maia               H.lucina H.sp.GreatLakesComplex 
##                    580                    109                     64 
##           H.nevadensis             H.slosseri           H.latifascia 
##                     63                     52                     32 
##             H.peigleri        H.maia.x.lucina 
##                      2                      0
Filter data
missing_data <- poppr::info_table(genpop_data, type = "missing", plot = T) #plot missing data across loci and populations

round(missing_data[1:length(unique(genpop_data$pop)), 1:4], 2) #show proportion of missing data across populations for first four loci
##                         Locus
## Population               1337:7:+ 1344:7:- 1538:93:+ 1604:40:-
##   H.lucina                   0.11     0.56      0.33      0.00
##   H.maia.x.lucina            1.00     1.00      1.00      0.00
##   H.latifascia               0.00     0.00      0.27      0.09
##   H.maia                     0.06     0.09      0.13      0.04
##   H.sp.GreatLakesComplex     0.10     0.10      0.30      0.00
##   H.nevadensis               0.00     0.00      0.06      0.19
##   H.slosseri                 0.12     0.06      0.12      0.06
##   H.peigleri                 0.10     0.00      0.10      0.00
genpop_data <- poppr::informloci(genpop_data, 
                                 MAF = 0.01, #only retain loci with at least 1% of the minor allele (minor allele frequency cutoff to 5%)
                                 cutoff = 2 / nInd(genpop_data)) #retain only loci with at least (2 / total number of individuals) differentiating genotypes are retained (a locus must have at least (2 / total number of individuals) of its genotypes be different from each other to be considered useful and being retained to ensure that only loci with enough variation are kept)
## cutoff value: 1.65289256198347 % ( 2 samples ).
## MAF         : 0.01
## 
##  Found 544 uninformative loci 
##  ============================ 
##  60 loci found with a cutoff of 2 samples :
##  234597:20:+, 249309:9:+, 267729:15:+, 330753:6:-, 390364:6:+,
## 395275:11:-, 430560:27:+, 479991:37:-, 503404:39:+, 551262:11:-,
## 609753:42:+, 610035:10:+, 759254:12:-, 776354:11:+, 777079:11:-,
## 820465:37:+, 898922:27:-, 977991:13:-, 1055307:9:-, 1056600:19:+,
## 1080379:59:-, 1097298:4:-, 1098638:7:+, 1199902:12:-, 1210181:49:-,
## 1357125:30:-, 1382467:21:+, 1495896:20:-, 1532928:6:-, 1596396:10:+,
## 1643540:5:-, 1705783:44:+, 1717386:21:+, 1759208:13:-, 1761264:4:+,
## 1762811:10:+, 1828445:6:-, 1829645:27:+, 1948175:7:+, 1982183:6:-,
## 2013013:6:-, 2218908:8:-, 2307601:6:+, 2376350:51:+, 2403159:24:-,
## 2409847:58:-, 2412337:25:-, 2527720:22:-, 2616050:12:-, 2721002:10:+,
## 2759681:9:-, 2911224:34:+, 2977139:18:+, 2993628:53:+, 2995958:4:+,
## 3215762:8:-, 3279858:6:-, 3310354:18:+, 3335345:24:+, 3365183:30:- 
##  544 loci found with MAF < 0.01 :
##  1337:7:+, 12488:9:+, 22620:9:+, 24788:6:-, 38194:4:-, 42958:8:+,
## 48723:6:-, 65637:19:-, 72032:14:-, 78058:4:+, 84123:35:+, 96838:54:+,
## 121454:31:+, 125541:14:+, 128764:4:-, 134084:4:+, 139964:4:-,
## 140704:26:-, 154460:8:-, 156841:9:-, 172343:14:+, 172469:9:+,
## 175073:9:-, 178277:9:-, 178522:42:-, 184552:18:-, 188755:40:-,
## 196503:21:-, 197938:14:-, 206265:15:-, 231381:16:-, 234597:20:+,
## 235868:8:-, 235915:19:-, 239754:16:-, 249309:9:+, 250369:40:-,
## 255968:12:+, 257692:7:+, 258856:58:-, 259039:15:-, 263627:14:-,
## 265997:4:-, 267729:15:+, 270708:18:+, 272021:12:+, 275577:7:+,
## 278694:48:-, 285495:8:+, 288977:12:-, 297850:9:-, 300051:13:+,
## 307195:13:-, 313001:22:-, 315842:7:-, 330753:6:-, 337482:12:-,
## 341860:15:+, 345195:35:-, 345771:22:+, 356223:7:+, 375697:20:-,
## 390255:6:+, 390364:6:+, 390926:44:+, 395147:48:+, 395275:11:-,
## 405205:44:-, 410405:6:-, 414448:8:-, 419988:25:-, 428684:12:+,
## 430138:15:+, 430560:27:+, 434006:5:+, 436347:21:-, 446766:9:-,
## 448971:4:+, 450854:7:+, 457849:15:-, 461761:11:-, 473135:48:+,
## 473583:8:-, 474080:7:+, 474268:7:-, 479991:37:-, 482412:21:-,
## 491595:21:+, 499882:27:-, 503404:39:+, 506303:93:-, 515279:6:+,
## 535313:5:+, 536630:59:+, 537190:6:+, 541578:30:-, 543838:18:+,
## 548319:16:-, 549402:12:-, 550483:10:-, 550991:10:-, 551262:11:-,
## 554075:11:-, 559828:5:-, 563652:19:+, 566546:8:-, 580261:7:-,
## 604546:9:-, 609753:42:+, 610035:10:+, 614784:8:+, 620834:41:+,
## 625699:49:+, 628118:6:+, 643765:7:-, 649863:35:-, 650712:11:+,
## 656762:10:-, 659001:6:+, 662018:44:-, 669966:15:+, 672261:15:+,
## 679020:33:-, 681542:21:-, 682917:24:+, 687532:4:-, 692659:16:-,
## 696336:10:-, 697912:15:+, 703241:21:-, 707153:40:+, 710910:5:+,
## 718791:5:+, 723997:11:-, 726565:5:+, 727851:36:-, 729657:8:+,
## 732548:39:-, 736870:17:+, 747740:10:-, 750929:17:-, 756107:12:-,
## 759254:12:-, 760998:11:+, 766840:4:+, 772302:13:-, 773928:7:-,
## 774117:73:+, 775659:9:+, 776354:11:+, 777079:11:-, 777443:12:+,
## 778270:36:-, 789617:48:+, 790128:41:+, 811746:5:-, 819100:11:+,
## 819327:17:-, 820465:37:+, 825100:21:-, 831670:6:-, 841628:17:+,
## 867057:6:+, 867132:5:+, 871522:4:+, 880671:43:+, 882527:5:-,
## 898922:27:-, 917205:13:-, 929975:9:-, 959746:36:-, 966142:12:+,
## 977991:13:-, 979187:20:+, 980706:5:-, 994801:9:-, 1001873:30:+,
## 1030276:10:+, 1050635:16:+, 1055307:9:-, 1056117:7:-, 1056467:35:+,
## 1056600:19:+, 1058512:6:-, 1061001:20:-, 1063374:7:+, 1068068:23:+,
## 1068574:24:-, 1075834:12:-, 1080379:59:-, 1084472:9:+, 1086987:13:+,
## 1097298:4:-, 1098638:7:+, 1107901:7:+, 1111519:21:+, 1119478:5:+,
## 1122674:13:-, 1122730:9:-, 1124908:26:-, 1129053:12:+, 1130922:4:+,
## 1131296:66:+, 1136980:93:+, 1142371:4:-, 1172723:6:-, 1175445:10:-,
## 1178051:16:+, 1188064:28:+, 1192965:10:+, 1195684:9:+, 1199902:12:-,
## 1208771:31:+, 1209362:13:+, 1210181:49:-, 1211497:5:+, 1212711:8:-,
## 1228787:13:-, 1250628:26:-, 1253149:20:-, 1253833:15:-, 1272192:9:-,
## 1283874:6:-, 1285858:14:-, 1286143:72:+, 1300043:7:-, 1301567:18:-,
## 1303636:11:-, 1306128:46:-, 1314963:53:-, 1317589:9:+, 1323595:4:-,
## 1325792:42:+, 1338850:14:+, 1339097:4:+, 1347312:10:-, 1347412:84:+,
## 1348720:4:+, 1351091:8:-, 1357125:30:-, 1360182:15:+, 1382467:21:+,
## 1411773:36:+, 1419937:8:-, 1421558:4:-, 1437981:15:-, 1446627:10:-,
## 1449784:9:-, 1466829:12:-, 1476625:9:-, 1485738:44:-, 1495896:20:-,
## 1508415:52:-, 1509075:12:+, 1532928:6:-, 1552355:9:+, 1556155:5:-,
## 1562109:7:-, 1571857:13:-, 1571956:15:-, 1577844:22:-, 1581607:16:-,
## 1593374:47:+, 1596396:10:+, 1604169:51:-, 1615444:6:+, 1632203:4:+,
## 1643540:5:-, 1651269:6:+, 1655211:18:-, 1658500:7:+, 1660299:9:+,
## 1672751:13:-, 1682905:14:+, 1684017:4:-, 1699564:11:-, 1703312:4:+,
## 1705783:44:+, 1717386:21:+, 1718964:40:+, 1721110:14:-, 1726628:5:+,
## 1727756:47:-, 1732000:6:-, 1736572:9:+, 1744321:27:-, 1753241:12:+,
## 1755573:4:+, 1759208:13:-, 1760087:15:-, 1760881:26:+, 1761264:4:+,
## 1762486:23:+, 1762811:10:+, 1764247:40:+, 1776337:4:+, 1781801:12:-,
## 1787267:16:+, 1795737:13:+, 1819292:7:-, 1819932:12:-, 1823783:20:+,
## 1825241:27:+, 1827040:7:-, 1828445:6:-, 1829645:27:+, 1831438:20:+,
## 1832020:36:-, 1842264:5:-, 1843830:20:-, 1844907:9:-, 1857098:14:-,
## 1863976:5:+, 1868751:5:-, 1872817:24:-, 1887103:33:+, 1887223:36:-,
## 1900993:19:-, 1915300:6:-, 1927473:16:+, 1934290:7:-, 1947652:22:+,
## 1947928:12:+, 1947957:22:-, 1948175:7:+, 1948942:19:+, 1951103:10:-,
## 1953940:17:-, 1971803:21:+, 1982183:6:-, 1988081:16:+, 1988241:7:-,
## 2004350:4:+, 2013013:6:-, 2014900:9:-, 2018145:41:+, 2025219:13:-,
## 2028256:15:+, 2057063:27:-, 2061137:26:-, 2062365:5:-, 2068328:8:+,
## 2069780:6:+, 2072688:10:+, 2077659:16:-, 2079198:23:-, 2080321:9:-,
## 2092814:9:-, 2110545:18:+, 2130427:20:-, 2132717:11:+, 2136215:8:-,
## 2138234:4:-, 2144223:20:+, 2157187:6:+, 2160112:18:-, 2169081:24:+,
## 2171959:15:-, 2190514:15:+, 2191387:5:-, 2193033:87:-, 2196461:7:-,
## 2200550:10:+, 2204766:5:+, 2218908:8:-, 2240061:11:+, 2268956:4:+,
## 2271152:48:+, 2277116:4:+, 2277199:15:-, 2277339:14:-, 2284766:49:-,
## 2285075:16:+, 2292014:33:+, 2301146:9:-, 2303391:12:-, 2307601:6:+,
## 2317597:14:+, 2339974:16:+, 2373256:8:+, 2375452:9:+, 2376350:51:+,
## 2377620:27:+, 2380500:17:+, 2389904:20:+, 2398477:23:-, 2403159:24:-,
## 2403355:27:-, 2406620:19:-, 2409847:58:-, 2410868:78:+, 2412337:25:-,
## 2415230:6:-, 2417676:7:+, 2419051:4:-, 2432487:31:-, 2433460:9:-,
## 2437701:6:-, 2439649:12:+, 2443858:5:+, 2465652:10:+, 2480277:21:+,
## 2487006:4:-, 2490212:26:-, 2496499:19:-, 2502726:24:-, 2503228:5:+,
## 2527700:6:-, 2527720:22:-, 2533874:36:-, 2547003:28:-, 2548127:23:-,
## 2559298:24:-, 2569535:11:+, 2581436:8:+, 2610951:6:-, 2614341:4:-,
## 2616050:12:-, 2617481:4:-, 2626826:6:+, 2627276:64:+, 2628664:36:-,
## 2630869:21:-, 2635134:37:-, 2638443:12:-, 2658914:6:+, 2662921:7:+,
## 2663413:25:-, 2680567:33:-, 2696685:16:+, 2700324:16:-, 2715898:20:+,
## 2719634:17:-, 2721002:10:+, 2726124:6:-, 2727000:9:-, 2735386:5:-,
## 2737033:6:+, 2741529:16:+, 2745477:47:-, 2758368:8:-, 2758487:28:-,
## 2759405:34:+, 2759681:9:-, 2766828:49:+, 2766963:59:-, 2768701:6:+,
## 2771383:15:-, 2786644:67:+, 2798593:10:-, 2806142:18:-, 2807313:6:-,
## 2817882:12:-, 2819054:4:+, 2819264:11:+, 2828486:4:+, 2839560:11:+,
## 2839703:76:-, 2841956:19:-, 2857625:34:-, 2857766:12:-, 2857863:7:-,
## 2863789:4:+, 2869983:9:+, 2871028:21:-, 2877535:9:+, 2878330:9:-,
## 2878501:12:+, 2890373:43:-, 2892783:12:+, 2903089:34:-, 2911224:34:+,
## 2911405:7:-, 2935032:8:-, 2935319:19:+, 2948983:13:-, 2967004:42:+,
## 2977139:18:+, 2978975:6:-, 2985944:11:-, 2986306:25:+, 2992673:17:-,
## 2993271:10:-, 2993353:16:+, 2993628:53:+, 2994999:9:+, 2995958:4:+,
## 2997169:5:+, 3018948:37:-, 3044925:41:-, 3045769:22:+, 3056024:62:-,
## 3091935:17:+, 3092204:14:+, 3112923:9:-, 3133732:23:-, 3144698:4:+,
## 3146587:5:+, 3147248:6:-, 3147551:4:+, 3148770:15:+, 3150676:30:-,
## 3153472:10:+, 3161794:29:-, 3171268:18:+, 3190905:30:-, 3194174:65:-,
## 3198359:19:+, 3199407:11:+, 3203478:4:-, 3214530:10:-, 3215594:9:+,
## 3215732:11:+, 3215762:8:-, 3222181:6:-, 3243256:37:-, 3253054:4:+,
## 3254816:28:-, 3269046:9:-, 3273472:13:+, 3277244:22:+, 3279858:6:-,
## 3299566:6:-, 3310354:18:+, 3311474:11:-, 3317853:30:-, 3323224:4:+,
## 3332556:4:-, 3332930:33:-, 3333910:39:+, 3335345:24:+, 3336781:9:+,
## 3340376:11:+, 3355302:5:+, 3361086:6:+, 3365183:30:-, 3370882:9:-,
## 3371435:6:+, 3376350:4:+, 3387137:20:-, 3401822:17:+, 3417690:6:+,
## 3422906:10:+, 3422983:8:+
genpop_data <- genpop_data %>% missingno("loci", cutoff = 0.2) #remove loci with average missing data higher than 20%
## 
##  No loci with missing values above 20% found.
genpop_data <- genpop_data %>% missingno("geno", cutoff = 0.3) #remove individuals with more than 30% missing genotypes
## 
## Found 118494 missing values.
## 
## 2 genotypes contained missing values greater than 30%
## 
## Removing 2 genotypes: maia_MA_DN2546, maia_MA_DN2549
dataset <- dataset[match(indNames(genpop_data), dataset$ID), ] #match genetic data and dataset
dataset$Species <- droplevels(dataset$Species) #drop unused factors
genpop_data@pop <- factor(dataset$Species) #reassign population slot of genind object to match updated dataset
Reevaluate data after filtering
poppr(genpop_data, quiet  = T) #show basic summary table, including number of individuals (N) and genotypes (MLG) per population, Shannon-Wiener index (H; genotype diversity), Stoddart and Taylor's G index (G, genotype diversity) and expected heterozygosity (Hexp) per population
##                      Pop   N MLG eMLG       SE    H   G lambda E.5 Hexp   Ia
## 1           H.latifascia  11  11   10 0.00e+00 2.40  11  0.909   1    0 24.5
## 2               H.lucina   9   9    9 0.00e+00 2.20   9  0.889   1    0 34.5
## 3                 H.maia  45  45   10 0.00e+00 3.81  45  0.978   1    0 29.4
## 4        H.maia.x.lucina   1   1    1 0.00e+00 0.00   1  0.000 NaN    0   NA
## 5           H.nevadensis  16  16   10 0.00e+00 2.77  16  0.938   1    0 26.3
## 6             H.peigleri  10  10   10 0.00e+00 2.30  10  0.900   1    0 18.1
## 7             H.slosseri  17  17   10 2.31e-07 2.83  17  0.941   1    0 57.1
## 8 H.sp.GreatLakesComplex  10  10   10 0.00e+00 2.30  10  0.900   1    0 24.9
## 9                  Total 119 119   10 0.00e+00 4.78 119  0.992   1    0 27.7
##     rbarD        File
## 1 0.01245 genpop_data
## 2 0.02325 genpop_data
## 3 0.00960 genpop_data
## 4      NA genpop_data
## 5 0.05717 genpop_data
## 6 0.00910 genpop_data
## 7 0.02560 genpop_data
## 8 0.01204 genpop_data
## 9 0.00711 genpop_data
missing_data <- poppr::info_table(genpop_data, type = "missing", plot = T) #plot missing data across loci and populations

round(missing_data[1:length(unique(genpop_data$pop)), 1:4], 2) #show proportion of missing data across populations for first four loci
##                         Locus
## Population               1344:7:- 1538:93:+ 1604:40:- 1802:5:+
##   H.latifascia               0.55      0.27      0.00     0.18
##   H.lucina                   0.00      0.11      0.33     0.00
##   H.maia                     0.09      0.13      0.04     0.09
##   H.maia.x.lucina            0.00      1.00      0.00     1.00
##   H.nevadensis               0.06      0.06      0.00     0.06
##   H.peigleri                 0.00      0.10      0.00     0.00
##   H.slosseri                 0.00      0.24      0.06     0.12
##   H.sp.GreatLakesComplex     0.00      0.30      0.00     0.10
nLoc(genpop_data) #show number of loci
## [1] 4629
private_alleles(genpop_data) %>% apply(MARGIN = 1, FUN = sum) #show number of private alleles (alleles found exclusively in one population) per site across all loci for each population
##           H.latifascia               H.lucina                 H.maia 
##                    252                    124                   2327 
##        H.maia.x.lucina           H.nevadensis             H.peigleri 
##                      2                    941                     93 
##             H.slosseri H.sp.GreatLakesComplex 
##                    609                    189
plot(summary(genpop_data)$n.by.pop, summary(genpop_data)$pop.n.all, 
     xlab = "Number of indiduals", ylab = "Number of alleles", 
     main = "Alleles numbers and sample sizes", type = "n")
text(summary(genpop_data)$n.by.pop, summary(genpop_data)$pop.n.all,
     lab = names(summary(genpop_data)$n.by.pop))

barplot(summary(genpop_data)$Hexp - summary(genpop_data)$Hobs, 
        main = "Heterozygosity: expected-observed", ylab = "Hexp - Hobs")

par(mar =  c(13.1, 5.1, 4.1, 2.1)) #adjust plot margins
barplot(summary(genpop_data)$n.by.pop, 
        main = "Sample sizes per population", ylab = "Number of genotypes", las = 3)

tail(sort(round(summary(genpop_data)$Hobs, 2))) #show loci with highest observed heterozygosity
## 3219076:10:-  1714120:9:+   769190:9:+ 1079236:33:- 3003808:16:+   179790:9:- 
##         0.44         0.45         0.46         0.46         0.46         0.47
Assess genotypic richness and diversity for each population

Diversity indices incorporate both genotypic richness and abundance.

round(((poppr(genpop_data))$eMLG), 2) #show genotypic richness accounting for sample size differences (eMLG) via rarefaction
## [1] 10  9 10  1 10 10 10 10 10
round(((poppr(genpop_data))$lambda), 2) #show Simpson's index lambda (Simpson 1949; measure of genotypic diversity: estimation of probability that two randomly selected genotypes are different scaling from 0 with no genotypes are different to 1 so that all genotypes are different)
## [1] 0.91 0.89 0.98 0.00 0.94 0.90 0.94 0.90 0.99
Corr_Simp_ind <- round((((poppr(genpop_data, quiet  = T))$N / ((poppr(genpop_data, quiet  = T))$N - 1)) * (poppr(genpop_data, quiet  = T))$lambda), 2) #calculate sample-size-corrected Simpson's index
data.frame(Population = levels(genpop_data$pop), Value = Corr_Simp_ind[1:length(levels(genpop_data$pop))], stringsAsFactors = F) #print corrected Simpson's index
##               Population Value
## 1           H.latifascia     1
## 2               H.lucina     1
## 3                 H.maia     1
## 4        H.maia.x.lucina   NaN
## 5           H.nevadensis     1
## 6             H.peigleri     1
## 7             H.slosseri     1
## 8 H.sp.GreatLakesComplex     1
Assess evenness for each population

Eveness is a measure of distribution of genotype abundances so that a population with equally abundant genotypes yields value equal to 1 while a population dominated by single genotype is closer to zero.

round(((poppr(genpop_data))$E.5), 2) #show evenness E5 (Pielou 1975, Ludwig & Reynolds 1988, GrĂĽnwald et al. 2003)
## [1]   1   1   1 NaN   1   1   1   1   1

2) Basic population analyses

Test if mean observed heterozygosity is significantly lower than mean expected heterozygosity

A significant p-value indicates lower observed heterozygosity

if ((bartlett.test(list(summary(genpop_data)$Hexp, summary(genpop_data)$Hobs)))$p.value < 0.05) { #test for homogeneity of variances using Bartlett test
  var_equal <- T  #variances are significantly different
} else {var_equal <- F}  #variances are not significantly different
t.test(summary(genpop_data)$Hexp, summary(genpop_data)$Hobs, #perform t-test
       paired = T, var.equal = var_equal, alternative = "greater")
## 
##  Paired t-test
## 
## data:  summary(genpop_data)$Hexp and summary(genpop_data)$Hobs
## t = 50.54, df = 4628, p-value < 2.2e-16
## alternative hypothesis: true mean difference is greater than 0
## 95 percent confidence interval:
##  0.03905451        Inf
## sample estimates:
## mean difference 
##       0.0403686
Test for Hardy-Weinberg-Equilibrium for each population and plot results

Ignore any warnings that may come up (not shown in output here). Export figures with significantly higher width to achieve higher resolution and interpretability (they appear small here due to their large width of figure).

hwe_results <- lapply(seppop(genpop_data), hw.test, B = 0) #ignore any warnings showing up

hwe_pvalues_df <- melt(sapply(hwe_results, "[", i = T, j = 3)) #extract p-values from each population's test results and convert matrix to data frame for ggplot
colnames(hwe_pvalues_df) <- c("Locus", "Population", "P_value")
hwe_pvalues_df$P_value_adj <- p.adjust(hwe_pvalues_df$P_value, method = "fdr") #perform FDR correction
hwe_pvalues_df$Significant <- hwe_pvalues_df$P_value_adj < 0.05 #create significance column based on adjusted p-values
ggplot(hwe_pvalues_df, aes(x = Locus, y = Population, fill = P_value)) + #plot full heatmap of P-values
  geom_tile(color = "white") +
  scale_fill_viridis_c(option = "mako", name = "P-value", direction = -1) +
  labs(x = "Locus", y = "Population", 
       title = "HWE p-values across loci and populations") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_text(size = 8),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        plot.title = element_text(hjust = 0.5),
        plot.margin = margin(0, 0, 0, 0))

ggplot(hwe_pvalues_df, aes(x = Locus, y = Population)) + #plot significant p-values
  geom_tile(aes(fill = Significant), color = "white") + #highlight only significant p-values
  scale_fill_manual(values = c("white", "red"), name = "Significant") + #grey for non-significant, red for significant
  labs(x = "Locus", y = "Population", 
       title = "Significant HWE p-values (FDR corrected)") +
  theme_minimal() +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_text(size = 8),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        plot.title = element_text(hjust = 0.5),
        plot.margin = margin(0, 0, 0, 0))

Calculate and plot observed and expected heterozygosity for each population
calc.heterozygosity <- function(genind_data) { #create function to calculate gene diversity within each population
  if (!inherits(genind_data, "genind")) {
    stop("Input must be a genind object")}
  populations <- levels(pop(genind_data))
  observed_heterozygosity <- numeric(length(populations))
  expected_heterozygosity <- numeric(length(populations))
  for (i in seq_along(populations)) {
    pop <- populations[i]
    pop_data <- genind_data[pop(genind_data) == pop] #subset genind object by population
    if (nInd(pop_data) == 0) {
      warning(paste("No data for population:", pop))
      next}
    tryCatch({
      summary_stats <- summary(pop_data) #check if Hobs and Hexp are present before computing means
      if (!is.null(summary_stats$Hobs)) {
        observed_heterozygosity[i] <- mean(summary_stats$Hobs, na.rm = T)}
      if (!is.null(summary_stats$Hexp)) {
        expected_heterozygosity[i] <- mean(summary_stats$Hexp, na.rm = T)}
    }, error = function(e) {
      warning(paste("Error processing population:", pop, "Details:", e$message))})}
  heterozygosity_df <- data.frame(
    Population = populations,
    Observed = round(observed_heterozygosity, 2),
    Expected = round(expected_heterozygosity, 2))
  print(heterozygosity_df) #print results
  return(heterozygosity_df)} #return heterozygosity data frame
plot_heterozygosity <- function(heterozygosity_df) { #create function for plotting
  heterozygosity_long <- reshape2::melt(heterozygosity_df, id.vars = "Population", 
                                        variable.name = "Type", value.name = "Heterozygosity") #melt data frame for plotting
  ggplot(heterozygosity_long, aes(x = Population, y = Heterozygosity, fill = Type)) + 
    geom_bar(stat = "identity", position = position_dodge(width = 0.8)) + 
    labs(x = "Population", y = "Heterozygosity", fill = "Type") + 
    theme_classic() + 
    scale_fill_manual(values = c("Observed" = "darkblue", "Expected" = "lightblue"))} #set fill colors
heterozygosity_df <- calc.heterozygosity(genpop_data) #calculate heterozygosity
##               Population Observed Expected
## 1           H.latifascia     0.11     0.12
## 2               H.lucina     0.09     0.10
## 3                 H.maia     0.10     0.13
## 4        H.maia.x.lucina     0.10     0.28
## 5           H.nevadensis     0.02     0.04
## 6             H.peigleri     0.12     0.12
## 7             H.slosseri     0.11     0.13
## 8 H.sp.GreatLakesComplex     0.11     0.13
plot_heterozygosity(heterozygosity_df) #plot heterozygosity

3) Assess and test for population structure

F-statistics (Weir & Cockerham 1984)
  • Fst: Measures genetic differentiation among subpopulations (pop/tot)
  • Fis: Measures inbreeding coefficient within subpopulations (ind/pop)
  • Fit: Measures overall inbreeding coefficient by combining both within and among subpopulation inbreeding
genpop_data_loci <- pegas::Fst(pegas::as.loci(genpop_data))
head(round(genpop_data_loci, 2)) #per-locus F-statistics
##             Fit  Fst   Fis
## 1344:7:-   0.41 0.18  0.28
## 1538:93:+  0.62 0.38  0.38
## 1604:40:-  0.61 0.29  0.45
## 1802:5:+   0.00 0.06 -0.06
## 2448:32:- -0.01 0.00 -0.02
## 3144:28:+  0.31 0.26  0.06
round(colMeans(genpop_data_loci[, !is.na(colMeans(genpop_data_loci, na.rm = T)) & !is.nan(colMeans(genpop_data_loci, na.rm = T))], na.rm = T), 2) #global F-statistics (take into account all genotypes and loci but ignore loci with NA)
##  Fit  Fst  Fis 
## 0.27 0.10 0.20
round(boot.vc(genind2hierfstat(genpop_data)[1], genind2hierfstat(genpop_data)[-1], #calculate CIs of F-statistics (H-Total: total expected heterozygosity, F-pop/Total = Fst, F-Ind/Total = Fit, H-pop: expected heterozygosity within populations, F-Ind/pop: Fis, Hobs: observed heterozygosity) ot = 100)$ci, 2) 
              nboot = 300)$ci, 2) #specify number of bootstraps (increase for proper analysis)
##       H-Total F-pop/Total F-Ind/Total H-pop F-Ind/pop Hobs
## 2.5%     0.14        0.15        0.32  0.12      0.20 0.09
## 50%      0.14        0.15        0.33  0.12      0.20 0.09
## 97.5%    0.14        0.16        0.33  0.12      0.21 0.10
Compute and plot measures of genetic differentiation for populations

Create function to compute pairwise genetic differentiation among populations and plot results as heatmap

calc.and.plot.pairwise.gen.dist.mat <- function(pop_dataset, #genind or genlight object with “pop” factor for each individual
                                                genetic_diff_measure = "Fst", #one of "D_Jost", "Gst_Hedrick", "Fst", or "Ds_Nei".
                                                viridis_col = "mako", #name of Viridis palette (e.g., "mako", "viridis", magma etc.)
                                                nboots_Fst = 1000, #number of bootstraps (only for Fst)
                                                star_col_Fst = "#FFA500", #color for significance star in plot (only for Fst)
                                                signif_stars_Fst_plot = F, #add significance stars (for significant p-values after Bonferroni correction) in plot (only for Fst)
                                                min_n_per_pop = 5) { #minimum number of individuals required in each population so that populations with fewer than this threshold will be dropped
  
  ## Check sample sizes
  pop_sizes <- table(adegenet::pop(pop_dataset)) #tabulate number of individuals per population
  small_pops <- names(pop_sizes[pop_sizes < min_n_per_pop]) #identify populations with fewer than min_n_per_pop individuals
  if (length(small_pops) > 0) {
    cat("The following populations have fewer than", min_n_per_pop, "individuals and were removed for analysis:", 
        paste(small_pops, collapse = ", "), "\n")
    pop_dataset <- pop_dataset[!adegenet::pop(pop_dataset) %in% small_pops, ] #drop these populations
  }

  ## Compute genetic distance
  distance_matrix <- switch(genetic_diff_measure,
    
    ## Jost’s D (2008) using mmod::pairwise_D
    "D_Jost" = as.matrix(mmod::pairwise_D(pop_dataset, linearized = FALSE)),
    
    ## Hedrick’s G’ST (2005) using mmod::pairwise_Gst_Hedrick
    "Gst_Hedrick" = as.matrix(mmod::pairwise_Gst_Hedrick(pop_dataset, linearized = FALSE)),
    
    ## Weir & Cockerham’s FST (1984) using StAMPP
    "Fst" = {genind_obj <- StAMPP::stamppConvert(dartR::gi2gl(pop_dataset, verbose = 0), type = "genlight") #convert pop_dataset to genlight and then genind
      fst_results <- StAMPP::stamppFst(genind_obj, nboots = nboots_Fst, percent = 95, nclusters  = 1) #compute FST with bootstrapping for confidence intervals
      list(values = as.matrix(fst_results$Fsts), p_values = as.matrix(fst_results$Pvalues))}, #extract FST matrix and p-values matrix

    ## Nei’s standard genetic distance Ds (1972) using genet.dist
    "Ds_Nei" = as.matrix(genet.dist(pop_dataset, method = "Ds")),
    
    ## Print error if unsupported measure is supplied
    stop("Unsupported genetic differentiation measure. Choose from: ", "'D_Jost', 'Gst_Hedrick', 'Fst', 'Ds_Nei'."))
  
  ## Extract Fst and p-values
  if (genetic_diff_measure == "Fst") {
    fst_values <- distance_matrix$values
    p_values <- distance_matrix$p_values
    cat("p-values for Fst:\n")
    print(p_values)
  } else {
    fst_values <- distance_matrix
    p_values <- NULL
  }
  
  ## Replace negative distances with zero
  fst_values[fst_values < 0] <- 0
  
  ## Prepare matrix for plotting
  
  ## Labels for legend
  label_list <- c("D_Jost" = "Jost’s D", "Gst_Hedrick" = "G’", "Fst" = "F", "Ds_Nei" = "G")
  label <- label_list[[genetic_diff_measure]]
  
  ## For measures that report "G_ST" or "F_ST"
  label_lowercase  <- ifelse(genetic_diff_measure %in% c("Gst_Hedrick", "Fst", "Ds_Nei"), "ST", "")
  
  ## Print labels and Fst values
  cat(paste(label, "values:\n"))
  print(round(fst_values, 2))
  
  ## Create matrix that keeps only lower triangle
  distance_matrix_lower <- fst_values
  distance_matrix_lower[!lower.tri(distance_matrix_lower, diag = FALSE)] <- NA
  
  ## Melt to long format for ggplot
  distance_matrix_long <- reshape2::melt(distance_matrix_lower, na.rm = TRUE)
  colnames(distance_matrix_long) <- c("Row", "Column", "Distance")
  
  ## Add p-values and significance stars for Fst
  if (!is.null(p_values)) {
    
    ## Mask upper triangle of p-values matrix
    p_values_lower <- p_values
    p_values_lower[!lower.tri(distance_matrix_lower, diag = FALSE)] <- NA
    
    ## Melt p-values to long format
    p_values_long <- reshape2::melt(p_values_lower, na.rm = TRUE)
    colnames(p_values_long) <- c("Row", "Column", "P_value")
    
    ## Merge distances with p-values
    distance_matrix_long <- merge(distance_matrix_long, p_values_long, by = c("Row", "Column"))
    
    ## Bonferroni correction: α = 0.05 / number_of_comparisons
    n_comparisons <- nrow(distance_matrix_long)
    bonferroni_threshold <- 0.05 / n_comparisons
    
    ## Significance star if P ≤ threshold
    distance_matrix_long$Significance <- ifelse(distance_matrix_long$P_value <= bonferroni_threshold, "*", "")
  }
  
  ## Plot heatmap
    
  ## Include significance stars on tiles for Fst
  if (genetic_diff_measure == "Fst") {
    if (signif_stars_Fst_plot) {
    plot <- ggplot2::ggplot(distance_matrix_long, aes(x = Column, y = Row, fill = Distance)) +
      geom_tile() +
      geom_text(aes(label = Significance), size  = 7, color = star_col_Fst, na.rm = TRUE) +
      scale_fill_viridis_c(option = viridis_col, direction = -1) +
      theme_classic() +
      labs(x = "Population", y = "Population", fill = bquote(.(label)[.(label_lowercase)]))
    } else {
    plot <- ggplot2::ggplot(distance_matrix_long, aes(x = Column, y = Row, fill = Distance)) +
      geom_tile() +
      scale_fill_viridis_c(option = viridis_col, direction = -1) +
      theme_classic() +
      labs(x = "Population", y = "Population", fill = bquote(.(label)[.(label_lowercase)]))
    }
  } else {
    
    ## Non-FST measures
    plot <- ggplot2::ggplot(distance_matrix_long, aes(x = Column, y = Row, fill = Distance)) +
      geom_tile() +
      scale_fill_viridis_c(option = viridis_col, direction = -1) +
      theme_classic() +
      labs(x = "Population", y = "Population", fill = bquote(.(label)[.(label_lowercase)]))}
  
  ## Print heatmap
  print(plot)
}

Ignore any warnings that may come up (not shown in output here)

calc.and.plot.pairwise.gen.dist.mat(genpop_data, genetic_diff_measure = "Fst", nboots_Fst = 1000, signif_stars_Fst_plot = F) #compute and plot Fst (Weir and Cockerham 1984) with bootstrapping for Bonferroni corrected p-values
## The following populations have fewer than 5 individuals and were removed for analysis: H.maia.x.lucina
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## Registered S3 method overwritten by 'genetics':
##   method      from 
##   [.haplotype pegas
## Starting gl.compliance.check 
##   Processing genlight object with SNP data
##   The slot loc.all, which stores allele name for each locus, is empty. 
## Creating a dummy variable (A/C) to insert in this slot. 
##   Checking coding of SNPs
##     SNP data scored NA, 0, 1 or 2 confirmed
##   Checking locus metrics and flags
##   Recalculating locus metrics
##   Checking for monomorphic loci
##     Dataset contains monomorphic loci
##   Checking for loci with all missing data
##     No loci with all missing data detected
##   Checking whether individual names are unique.
##   Checking for individual metrics
##   Warning: Creating a slot for individual metrics
##   Checking for population assignments
##     Population assignments confirmed
##   Spelling of coordinates checked and changed if necessary to 
##             lat/lon
## Completed: gl.compliance.check 
## p-values for Fst:
##                        H.maia H.latifascia H.sp.GreatLakesComplex H.lucina
## H.maia                     NA           NA                     NA       NA
## H.latifascia                0           NA                     NA       NA
## H.sp.GreatLakesComplex      0            0                     NA       NA
## H.lucina                    0            0                      0       NA
## H.nevadensis                0            0                      0        0
## H.slosseri                  0            0                      0        0
## H.peigleri                  0            0                      0        0
##                        H.nevadensis H.slosseri H.peigleri
## H.maia                           NA         NA         NA
## H.latifascia                     NA         NA         NA
## H.sp.GreatLakesComplex           NA         NA         NA
## H.lucina                         NA         NA         NA
## H.nevadensis                     NA         NA         NA
## H.slosseri                        0         NA         NA
## H.peigleri                        0          0         NA
## F values:
##                        H.maia H.latifascia H.sp.GreatLakesComplex H.lucina
## H.maia                     NA           NA                     NA       NA
## H.latifascia             0.06           NA                     NA       NA
## H.sp.GreatLakesComplex   0.04         0.05                     NA       NA
## H.lucina                 0.13         0.18                   0.14       NA
## H.nevadensis             0.27         0.35                   0.37     0.49
## H.slosseri               0.09         0.09                   0.10     0.21
## H.peigleri               0.08         0.07                   0.08     0.20
##                        H.nevadensis H.slosseri H.peigleri
## H.maia                           NA         NA         NA
## H.latifascia                     NA         NA         NA
## H.sp.GreatLakesComplex           NA         NA         NA
## H.lucina                         NA         NA         NA
## H.nevadensis                     NA         NA         NA
## H.slosseri                     0.32         NA         NA
## H.peigleri                     0.37       0.06         NA

calc.and.plot.pairwise.gen.dist.mat(genpop_data, genetic_diff_measure = "Gst_Hedrick") #compute and plot Hedrick's G'st (Hedrick 2005)
## The following populations have fewer than 5 individuals and were removed for analysis: H.maia.x.lucina
## G’ values:
##                        H.latifascia H.lucina H.maia H.nevadensis H.peigleri
## H.latifascia                   0.00     0.22   0.07         0.36       0.09
## H.lucina                       0.22     0.00   0.17         0.49       0.24
## H.maia                         0.07     0.17   0.00         0.35       0.10
## H.nevadensis                   0.36     0.49   0.35         0.00       0.38
## H.peigleri                     0.09     0.24   0.10         0.38       0.00
## H.slosseri                     0.11     0.25   0.11         0.36       0.07
## H.sp.GreatLakesComplex         0.07     0.18   0.06         0.37       0.10
##                        H.slosseri H.sp.GreatLakesComplex
## H.latifascia                 0.11                   0.07
## H.lucina                     0.25                   0.18
## H.maia                       0.11                   0.06
## H.nevadensis                 0.36                   0.37
## H.peigleri                   0.07                   0.10
## H.slosseri                   0.00                   0.12
## H.sp.GreatLakesComplex       0.12                   0.00

calc.and.plot.pairwise.gen.dist.mat(genpop_data, genetic_diff_measure = "Ds_Nei") #compute and plot Nei's standard genetic distance Ds (Nei 1972)
## The following populations have fewer than 5 individuals and were removed for analysis: H.maia.x.lucina 
## G values:
##                        H.latifascia H.lucina H.maia H.nevadensis H.peigleri
## H.latifascia                   0.00     0.04   0.02         0.05       0.02
## H.lucina                       0.04     0.00   0.03         0.07       0.05
## H.maia                         0.02     0.03   0.00         0.05       0.02
## H.nevadensis                   0.05     0.07   0.05         0.00       0.05
## H.peigleri                     0.02     0.05   0.02         0.05       0.00
## H.slosseri                     0.02     0.05   0.02         0.05       0.02
## H.sp.GreatLakesComplex         0.02     0.03   0.01         0.05       0.02
##                        H.slosseri H.sp.GreatLakesComplex
## H.latifascia                 0.02                   0.02
## H.lucina                     0.05                   0.03
## H.maia                       0.02                   0.01
## H.nevadensis                 0.05                   0.05
## H.peigleri                   0.02                   0.02
## H.slosseri                   0.00                   0.02
## H.sp.GreatLakesComplex       0.02                   0.00

calc.and.plot.pairwise.gen.dist.mat(genpop_data, genetic_diff_measure = "D_Jost") #compute and plot Jost’s D (2008)
## The following populations have fewer than 5 individuals and were removed for analysis: H.maia.x.lucina
## Jost’s D values:
##                        H.latifascia H.lucina H.maia H.nevadensis H.peigleri
## H.latifascia                   0.00     0.03   0.01         0.04       0.01
## H.lucina                       0.03     0.00   0.02         0.07       0.04
## H.maia                         0.01     0.02   0.00         0.04       0.01
## H.nevadensis                   0.04     0.07   0.04         0.00       0.05
## H.peigleri                     0.01     0.04   0.01         0.05       0.00
## H.slosseri                     0.02     0.04   0.02         0.05       0.01
## H.sp.GreatLakesComplex         0.01     0.03   0.01         0.05       0.01
##                        H.slosseri H.sp.GreatLakesComplex
## H.latifascia                 0.02                   0.01
## H.lucina                     0.04                   0.03
## H.maia                       0.02                   0.01
## H.nevadensis                 0.05                   0.05
## H.peigleri                   0.01                   0.01
## H.slosseri                   0.00                   0.02
## H.sp.GreatLakesComplex       0.02                   0.00

Estimate individual inbreeding

Inbreeding represents the excess of homozygosity in an individual due to inheriting two identical alleles from recent common ancestor. It can lead to inbreeding depression so the associated loss of fitness, often due to recessive deleterious alleles that are more frequent than in the population. Ignore any warnings that may come up (not shown in output here)

genpop_data_inbreeding_mean <- sapply(inbreeding(genpop_data, N = 1000, M = 2000), mean) #compute mean inbreeding values for each individual (mean from the likelihood distribution of each individual)
hist(genpop_data_inbreeding_mean, main = "Average inbreeding in individuals") #plot distribution of (mean) inbreeding values for each individual

round(tail(sort(genpop_data_inbreeding_mean), n = 10), 2) #list ten individuals with highest mean inbreeding
##            bog_NY_DN2502        TX_Andrews_Hmg026       TX_Lampasas_Hmg080 
##                     0.51                     0.51                     0.51 
##      NM_Escondido_Hmg195           maia_MA_DN2543            nev_CA_DN2395 
##                     0.51                     0.52                     0.52 
## MN_CarlosAverySWA_Hmg324      WI_RomePond1_Hmg017              nev_NV_DR99 
##                     0.52                     0.52                     0.52 
##           maia_MA_DN2379 
##                     0.52
genpop_data_inbreeding_df <- data.frame(ID = names(genpop_data_inbreeding_mean), Inbreeding_Mean = genpop_data_inbreeding_mean, Population = genpop_data$pop)
ggplot(genpop_data_inbreeding_df, aes(x = Inbreeding_Mean)) + #plot distribution of (mean) inbreeding values for each individual for each population
  facet_wrap(~ Population, scales = "free_y") +
  geom_histogram(binwidth = 0.005, fill = "lightblue", color = "black", alpha = 0.8) +
  labs(x = "Individual mean inbreeding value", y = "Frequency") +
  theme_classic() +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank())

genpop_data_inbreeding_df %>% group_by(Population) %>% top_n(5, Inbreeding_Mean) %>% arrange(Population, desc(Inbreeding_Mean)) %>% mutate(Inbreeding_Mean = round(Inbreeding_Mean, 2)) #list ten individuals with highest mean inbreeding
## # A tibble: 36 Ă— 3
## # Groups:   Population [8]
##    ID                       Inbreeding_Mean Population  
##    <chr>                              <dbl> <fct>       
##  1 MN_CarlosAverySWA_Hmg324            0.52 H.latifascia
##  2 maia_IA_DN2517                      0.51 H.latifascia
##  3 maia_IA_DN2480                      0.51 H.latifascia
##  4 MN_CarlosAverySWA_Hmg327            0.5  H.latifascia
##  5 NE_Mullen_Hmg042                    0.5  H.latifascia
##  6 MA_Montague_Hmg044                  0.51 H.lucina    
##  7 luc_CT_DN2565                       0.5  H.lucina    
##  8 luc_CT_DN2563                       0.5  H.lucina    
##  9 NH_CheshireCo_Hmg130                0.5  H.lucina    
## 10 NH_CheshireCo_Hmg129                0.5  H.lucina    
## # ℹ 26 more rows
Estimate linkage disequilibrium

Using corrected index of association rd (Agapow & Burt 2001) to estimate linkage between loci for each population (less biased version of original index of association by Brown et al. (1980) that accounts for number of sampled loci; it uses permutation test with null hypothesis of unlinked loci). r̄d near 0: very low linkage disequilibrium (loci mostly assort independently - random mating), r̄d 0.01-0.05: Moderate linkage disequilibrium (some degree of non-random association), r̄d > 0.05: Relatively high linkage disequilibrium (can indicate strong clonality, strong population structure, or selection); but numbers are relative so that they need to be compared between populations!

results_list <- list() #initialize empty list to store results for each population

## Loop over each population level in dataset
for (pop in levels(pop(genpop_data))) {
  
  ## Subset data for population
  pop_data <- genpop_data[pop(genpop_data) == pop, , drop = TRUE]
  n <- nInd(pop_data)
  
  ## Skip if fewer than 5 individuals
  if (n < 5) {results_list[[pop]] <- c(rbarD = NA, p.value = NA); next}
  
  ## Calculate standardized index of association (rbarD) with permutations
  ia_result <- poppr::ia(pop_data, sample = 100, index = "rbarD")
  
  ## Store rbarD and p-value
  results_list[[pop]] <- c(rbarD = ia_result["rbarD"], p.value = ia_result["p.rD"])
}

## Combine results, convert to data frame and show results
genpop_data_LD <- do.call(rbind, results_list)
colnames(genpop_data_LD) <- c("rbarD", "p.value")
genpop_data_LD <- data.frame(Population = rownames(genpop_data_LD), rd = as.numeric(genpop_data_LD[, "rbarD"]),
  p.value = as.numeric(genpop_data_LD[, "p.value"]), row.names = NULL)
genpop_data_LD
##               Population          rd    p.value
## 1           H.latifascia 0.012451220 0.00990099
## 2               H.lucina 0.023248025 0.00990099
## 3                 H.maia 0.009596439 0.00990099
## 4        H.maia.x.lucina          NA         NA
## 5           H.nevadensis 0.057168271 0.00990099
## 6             H.peigleri 0.009097995 0.00990099
## 7             H.slosseri 0.025596253 0.00990099
## 8 H.sp.GreatLakesComplex 0.012041593 0.00990099

4) Isolation-by-distance (IBD) using Mantel test

A) IBD among all individuals

Calculate Edwards’ genetic distance matrix between individuals (by transforming allele frequencies to square root values and compute Euclidean distance)
genetic_dist <- dist(sqrt(adegenet::tab(genpop_data, freq = T)), method = "euclidean")
Calculate geographic distance matrix (using Vincenty formula from geosphere package to compute distance between coordinates)
geo_dist <- geosphere::distm(as.matrix(dataset[, c("Longitude", "Latitude")]), fun = geosphere::distVincentySphere)
Perform Mantel test to compare genetic and geographic distances to evaluate the correlation between the two distance matrices
mantel_test <- ade4::mantel.randtest(as.dist(genetic_dist), as.dist(geo_dist), nrepet = 1000)
mantel_test #print Mantel test results
## Monte-Carlo test
## Call: ade4::mantel.randtest(m1 = as.dist(genetic_dist), m2 = as.dist(geo_dist), 
##     nrepet = 1000)
## 
## Observation: 0.3677038 
## 
## Based on 1000 replicates
## Simulated p-value: 0.000999001 
## Alternative hypothesis: greater 
## 
##       Std.Obs   Expectation      Variance 
## 20.0946833614 -0.0003455084  0.0003354669
plot(mantel_test) #plot results: black dot indicates observed correlation and histograms show permuted values

Plot IBD
combined_data <- data.frame(genetic_dist = reshape2::melt(as.matrix(genetic_dist))$value, #combine genetic and geographic distance matrices into one data frame
                            geo_dist = reshape2::melt(as.matrix(geo_dist))$value)
combined_data <- combined_data[combined_data$genetic_dist != 0 & combined_data$geo_dist != 0, ] #remove self-comparisons where distance is 0

svg("IBD across all individuals.svg", width = 25/2.56, height = 15/2.56)
ggplot(combined_data, aes(x = geo_dist / 1000, y = genetic_dist)) + #plot IBD
  geom_point(alpha = 0.2, color = "black", size = 1.6) + #scatter plot with points
  geom_smooth(method = "lm", color = "orange", linewidth = 1.8) + #linear model fit
  labs(x = "Geographic distance (km)", y = "Genetic distance (Edward's)", title = "Isolation-by-distance") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5), #center plot title
        axis.text = element_text(color = "black")) #color axis text
dev.off()
## png 
##   2

B) IBD among individuals within each population

Prepare data
population_subsets <- list() #define list to store subsets for each population
populations <- unique(genpop_data@pop) #extract unique populations from genind object
for (pop in populations) { #create subsets for each population
  pop_genind_subset <- genpop_data[genpop_data@pop == pop, ] #subset genind for population
  pop_coords_subset <- dataset[dataset$Species == pop, ] #subset coordinates for population
  population_subsets[[pop]] <- list(genind_obj = pop_genind_subset, coords_df = pop_coords_subset)} #store subsets in list
Create function to compute distance matrices and run Mantel test for each population
calc.dist.mat.and.run.Mantel.test <- function(genind_obj, coords_df, N_Mantel = 10000, min_ind = 5) {
  if (nrow(coords_df) < min_ind) { #check if population has more than 3 individuals
    cat("Skipping population", unique(genind_obj@pop), "due to fewer than", min_ind, "individuals.\n")
    return(NULL)}  #skip populations with 3 or fewer individuals
  genetic_dist <- tryCatch({ #calculate genetic distance (Edwards' distance)
    dist(sqrt(adegenet::tab(genind_obj, freq = T)), method = "euclidean")
  }, error = function(e) {cat("Error in genetic distance calculation:", e$message, "\n")
    return(NULL)})
  geo_dist <- tryCatch({ #calculate geographic distance (Vincenty distance)
    geosphere::distm(as.matrix(coords_df[, c("Longitude", "Latitude")]), fun = geosphere::distVincentySphere)
  }, error = function(e) {cat("Error in geographic distance calculation:", e$message, "\n")
    return(NULL)})
  mantel_test <- NULL #perform Mantel test if both distance matrices are available
  if (!is.null(genetic_dist) && !is.null(geo_dist)) {
    mantel_test <- tryCatch({
      ade4::mantel.randtest(as.dist(genetic_dist), as.dist(geo_dist), nrepet = N_Mantel)
    }, error = function(e) {cat("Error in Mantel test:", e$message, "\n")
      return(NULL)})
  } else {cat("Skipping Mantel test due to NULL distance matrices.\n")}
  plot_data <- NULL #prepare plot data and generate plot if distance matrices are available
  combined_data <- NULL #initialize combined_data to store genetic and geographic distances
  if (!is.null(genetic_dist) && !is.null(geo_dist)) {
    genetic_dist_matrix <- as.matrix(genetic_dist)
    geo_dist_matrix <- as.matrix(geo_dist)
    combined_data <- data.frame( #combine matrices for plotting
      genetic_dist = reshape2::melt(genetic_dist_matrix)$value,
      geo_dist = reshape2::melt(geo_dist_matrix)$value,
      Population = unique(genind_obj@pop)) #add population information
    combined_data <- combined_data[combined_data$genetic_dist != 0 & combined_data$geo_dist != 0, ] #remove self-comparisons
    mantel_p_value <- ifelse(is.null(mantel_test), NA, mantel_test$pvalue) #get Mantel p-value
    plot_title <- paste("Isolation-by-distance for", unique(genind_obj@pop), "\nMantel test p-value:", format(mantel_p_value, digits = 3)) #title with p-value
    plot_data <- ggplot(combined_data, aes(x = geo_dist / 1000, y = genetic_dist)) + #plot genetic vs geographic distance
      geom_point(alpha = 0.5, color = "black", size = 1.8) + #scatter plot with points
      geom_smooth(method = "loess", color = "darkgrey", linewidth = 0.9) + #LOESS smooth fit
      geom_smooth(method = "lm", color = "orange", linewidth = 1.5) +  #linear model fit
      labs(x = "Geographic distance (km)", y = "Genetic distance (Edwards')", title = plot_title) + #include plot title with p-value
      theme_classic() +
      theme(plot.title = element_text(hjust = 0.5), #center plot title
            axis.text = element_text(color = "black")) #color axis text
  } else {cat("Skipping plot due to missing distance matrices.\n")}
  return(list(mantel_test = mantel_test, plot_data = plot_data, combined_data = combined_data, num_individuals = nrow(coords_df)))} #return combined_data
results_list <- list() #initialize results list to store Mantel test results and plots
results_df <- data.frame(Population = character(), Num_Individuals = integer(), #initialize dataframe to store results
                         Mantel_p_value = numeric(), stringsAsFactors = F)
all_data <- data.frame() #initialize dataframe to store combined data for all populations
Loop through each population and perform analysis

Ignore any warnings that may come up (not shown in output here)

results_list <- list() #initialize results list to store Mantel test results and plots
results_df <- data.frame(Population = character(), Num_Individuals = integer(), #initialize dataframe to store results
                         Mantel_p_value = numeric(), stringsAsFactors = F)
all_data <- data.frame() #initialize dataframe to store combined data for all populations

for (pop in names(population_subsets)) {
  cat("Analyzing population:", pop, "\n")
  pop_data <- population_subsets[[pop]]$genind_obj #extract genind object for population
  pop_coords <- population_subsets[[pop]]$coords_df #extract coordinates dataframe for population
  results <- calc.dist.mat.and.run.Mantel.test(genind_obj = pop_data, coords_df = pop_coords) #analyze population
  if (!is.null(results)) { #store results if available
    results_list[[pop]] <- results
        results_df <- rbind(results_df, data.frame(
      Population = pop,
      Num_Individuals = results$num_individuals,
      Mantel_obs = ifelse(is.null(results$mantel_test), NA, results$mantel_test$obs),
      Mantel_p_value = ifelse(is.null(results$mantel_test), NA, results$mantel_test$pvalue),
      stringsAsFactors = FALSE)) 
    if (!is.null(results$combined_data)) {
      all_data <- rbind(all_data, results$combined_data)} #append combined data for each population
    if (!is.null(results$plot_data)) { #save plot if available
      print(results$plot_data)}}}
## Analyzing population: H.maia

## Analyzing population: H.latifascia

## Analyzing population: H.sp.GreatLakesComplex

## Analyzing population: H.lucina

## Analyzing population: H.maia.x.lucina 
## Skipping population 1 due to fewer than 5 individuals.
## Analyzing population: H.nevadensis

## Analyzing population: H.slosseri

## Analyzing population: H.peigleri

Save result dataframe as csv file
write.csv(results_df, "Mantel_test_results.csv", row.names = F)

5) PCA (Principal component analysis)

Genetic clusters are inferred through eigenvector decomposition of the allele frequencies (matrices) among individuals, aiming to reduce the complexity of genetic data by projecting it into a lower-dimensional space with each axis (principal component) capturing a portion of the variance in the allele frequencies (Patterson et al. 2006, Reich et al. 2008).

Define plotting parameters
cols_pop <- viridis::magma(n = length(levels(population_assignment)), begin = 1, end = 0) #define color palette
shapes_pop <- rep(c(21, 24, 22, 25), length.out = length(cols_pop)) #define shape palette
plot_width <- 25/2.56
plot_height <- 15/2.56
point_size <- 4.1
point_alpha <- 0.9
point_outline_color <- "black"
point_alpha_3D_plot <- 1
point_size_3D_plot <- 7
population_assignment_name <- "Species"
Define datasets to be used for PCA
pop_data_datasets <- list(genpop_data = list(data = genpop_data))
Create function to handle missing and infinite values
handle.NA.inf.values <- function(data) {
  data$tab <- as.matrix(data$tab)
  data$tab <- data$tab[, colSums(is.na(data$tab) | is.infinite(data$tab)) < nrow(data$tab)]
  data$tab <- apply(data$tab, 2, function(column) {
    column_mean <- mean(column, na.rm = T)
    column[is.na(column) | is.infinite(column)] <- column_mean
    return(column)})
  return(data)}
Create function to perform PCA
run.pca <- function(data) {
  data <- handle.NA.inf.values(data)
  prcomp(data$tab)}
Create function to calculate proportion of variance explained by each PC
calc.variance.explained <- function(pca_result) {
  (pca_result$sdev^2 / sum(pca_result$sdev^2)) * 100}
Create function to calculate number of PCs that explain at least 1% of variance
calc.N.pcs.above.threshold <- function(var_expl_perc, threshold = 1) {
  length(which(var_expl_perc >= threshold))}
Create function to create data frames for scree plot
create.scree.plot.df <- function(var_expl_perc) {
  data.frame(PC = seq_along(var_expl_perc), VarianceExplained = var_expl_perc)}
Create function to create PCA data frames for plotting (retain 60% of total PCs)
create.pca.df <- function(pca_result, group) {
  num_pcs <- ncol(pca_result$x)  # total number of PCs
  num_pcs_to_keep <- ceiling(num_pcs * 0.5) #keep 50% of total PCs
  data.frame(pca_result$x[, 1:num_pcs_to_keep], group = group)}
Create function to plot scree plots with threshold line
plot.scree.plot <- function(scree_plot_df, threshold) {
  num_pcs <- nrow(scree_plot_df)  #total number of PCs
  num_pcs_to_keep <- ceiling(num_pcs * 0.5) #keep 50% of total PCs
  ggplot(scree_plot_df[1:num_pcs_to_keep, ], aes(x = PC, y = VarianceExplained)) +
    geom_bar(stat = "identity", fill = "grey") +
    geom_line(aes(y = cumsum(VarianceExplained)), color = "orange") +
    geom_point(aes(y = cumsum(VarianceExplained)), color = "orange", size = 2) +
    labs(x = "PC", y = "Explained variance (%)") +
    geom_hline(yintercept = threshold, color = "black", linetype = "dashed") +
    theme_classic()}
Run PCA and plot scree plots
run.pca.for.datasets <- function(datasets) {
  lapply(datasets, function(subset) {
    pca_result <- run.pca(subset$data)
    var_expl_perc <- calc.variance.explained(pca_result)
    num_pcs <- calc.N.pcs.above.threshold(var_expl_perc)
    scree_plot_df <- create.scree.plot.df(var_expl_perc)
    print(plot.scree.plot(scree_plot_df, threshold = 1))
    list(pca_result = pca_result, variance_explained = var_expl_perc, num_pcs_above_1perc = num_pcs)})}
Run PCAs
pca_results <- run.pca.for.datasets(pop_data_datasets)

Plot PCA
pca_df_genpop_data <- create.pca.df(pca_results$genpop_data$pca_result, 
                                    factor(genpop_data$pop, levels = unique(population_assignment)))

svg("PCA_PC1_PC2_genpop_data.svg", width = plot_width, height = plot_height)
ggplot(pca_df_genpop_data, aes(x = PC1, y = PC2)) + #PCA scatterplot of PC1 and PC2
  geom_point(size = point_size, alpha = point_alpha, aes(shape = group, fill = group), 
             color = point_outline_color) +
  scale_shape_manual(values = shapes_pop) +
  scale_fill_manual(values = cols_pop) +
  labs(x = sprintf("PC1 (%.2f%%)", pca_results$genpop_data$variance_explained[1]),
       y = sprintf("PC2 (%.2f%%)", pca_results$genpop_data$variance_explained[2]),
       shape = population_assignment_name,
       fill = population_assignment_name) +
  theme_classic() +
  guides(override.aes = list(alpha = 1))
dev.off()

svg("PCA_PC1_PC3_genpop_data.svg", width = plot_width, height = plot_height)
ggplot(pca_df_genpop_data, aes(x = PC1, y = PC3)) + #PCA scatterplot of PC1 and PC3
  geom_point(size = point_size, alpha = point_alpha, aes(shape = group, fill = group), 
             color = point_outline_color) +
  scale_shape_manual(values = shapes_pop) +
  scale_fill_manual(values = cols_pop) +
  labs(x = sprintf("PC1 (%.2f%%)", pca_results$genpop_data$variance_explained[1]),
       y = sprintf("PC3 (%.2f%%)", pca_results$genpop_data$variance_explained[2]),
       shape = population_assignment_name,
       fill = population_assignment_name) +
  theme_classic() +
  guides(override.aes = list(alpha = 1))
dev.off()

plotly::plot_ly( #3D plot of PCA using PC1-PC3
  data = pca_df_genpop_data,
  x = ~PC1,
  y = ~PC2,
  z = ~PC3,
  color = ~group, colors = cols_pop,
  type = 'scatter3d', mode = 'markers',
  alpha = point_alpha_3D_plot, #marker transparency
  marker = list(size = point_size_3D_plot)) #marker size

6) DAPC - Discriminant analysis of principle components (Jombart et al. 2010)

Overview
Discriminant analysis of principal components (DAPC) is a multivariate method designed to identify and describe genetic clusters by determining the optimal number of clusters (k) that best summarizes the data. It is faster than traditional Bayesian clustering methods and does not assume Hardy-Weinberg Equilibrium (HWE) or linkage between loci. It maximizes variation between groups while minimizing variation within them, offering a distinct advantage over methods like PCA, PCoA, and MANOVA (which focus on global diversity but neglect group-specific differences). Furthermore, it is potentially more powerful and accurate in phylogenetic and hierarchical analyses (including hybridization) than methods that minimize HWE and linkage disequilibrium (e.g., STRUCTURE) (Kanno et al 2011, Dupuis and Sperling 2015, Jombart et al. 2010).
Approach
DAPC first transforms the data through principal component analysis (PCA) to ensure the variables submitted to discriminant analysis (DA) are uncorrelated and that their number is less than the number of analyzed individuals (in contrast to a standard DA), and then then applies DA to the retained principal components, aiming to discriminate between predefined groups. The eigenvalues in a DAPC represent the ratio of variance between groups over the variance within groups for each discriminant function, indicating the discriminant power of each axis in a DAPC plot.
How many PCs to retain?
Choosing the optimal number of retained principal components (PCs) is crucial for the effectiveness of DAPC. Retaining too many components can lead to overfitting and instability in membership probabilities, while retaining too few may result in a loss of valuable information. This can be done using the a-score (representing the difference between the proportion of successful reassignments observed in the analysis and values obtained using random groupings) or cross-validation. The latter optimization procedure is most suitable and finds the optimal number of retained PCs by splitting the data into two sets: a training (usually 90% of the data) and a validation set (typically the remaining 10%). The validation set is selected using stratified random sampling, ensuring that every group or population from the original dataset is represented in both sets. DAPC is then performed on the training set while retaining different numbers of PCs. The method then assesses how well the analysis can predict group membership for the individuals in the validation set. This helps to identify the optimal number of PCs to retain. The process is repeated multiple times to improve accuracy. The final number of PCs used in the analysis should be documented for transparency, along with evidence supporting the choice of k (Miller et al. 2020).
A priori vs de novo clustering
DAPC can be performed either using a priori defined groupings (e.g., populations/species) (similar to a PCA) or it can identify genetic clusters (k) de novo (e.g., similar to Structure). This decision can affect the DAPC results and is therefore important to consider (Miller et al. 2020). Misspecification of groupings can lead to artificially large populations with Wahlund-like effects of apparent depressed heterozygosity (Wahlund 1928), with these inflated population size estimates preventing conservation and increasing risk of extinction for one of “cryptic” genetic clusters, or it can lead to over splitting populations that should be combined. De novo group assignments can be inaccurate under conditions of high migration rates or low genetic differentiation but is mostly reliable when migration rates are low (Miller et al. 2020). For a priori assessments, the genetic distance between clusters reflects underlying FST values (Miller et al. 2020). Similar to other methods, artefactual discrete groups may be identified in populations with continuously distributed (cline-like) genetic diversity and under spatially heterogeneous sampling of populations. However, incorporating scatterplots for a graphical assessment of the genetic structures between clusters can allow to assess the organization of genetic variability and reveal potential clines.
k-means clustering
When grouping information is unknown, DAPC can incorporate k-means clustering to identify the optimal number of clusters k (Lee et al. 2009, Liu & Zhao 2006). The k-means algorithm maximizes variation between groups, and the optimal k is determined by comparing clustering solutions using the Bayesian Information Criterion (BIC), selecting the lowest BIC value.
Additional features
DAPC also provides membership probabilities for each individual, which are based on the retained discriminant functions. These probabilities differ from admixture coefficients in STRUCTURE but can similarly be interpreted as reflecting the proximity of individuals to different genetic clusters. They also offer insights into the distinctiveness of identified clusters and potential admixture events. This feature makes DAPC a potentially more powerful and accurate method for analyzing phylogenetic relationships, hierarchical structures, and hybridization events compared to methods like STRUCTURE, which minimize assumptions about HWE and linkage disequilibrium (Kanno et al. 2011, Dupuis and Sperling 2015, Jombart et al. 2010). Additionally, DAPC allows users to identify and plot alleles that contribute most significantly to the discrimination of clusters, providing further insight into genetic differentiation.

Define number of cross-validation runs

Use larger number here (e.g., 200)

crossval_N <- 20 #number of cross-validation runs

A) DAPC with prior grouping

Replace NA values and format genotype matrix
genpop_data@tab <- matrix(
  as.integer(round(apply(genpop_data@tab, 2, function(x) {
    x[is.na(x)] <- mean(x, na.rm = T) #replace NA with column means
    return(x)}))), #return modified column
  nrow = nrow(genpop_data@tab), #number of rows
  ncol = ncol(genpop_data@tab), #number of columns
  dimnames = dimnames(genpop_data@tab)) #preserve original dimensions
Remove groups with fewer than 2 members and adjust color and shapes scales accordingly
original_populations <- unique(genpop_data@pop) #identify populations before filtering
genpop_data_DAPC <- genpop_data[adegenet::pop(genpop_data) %in% names(table(adegenet::pop(genpop_data))[table(adegenet::pop(genpop_data)) > 1]), ] #filter populations with fewer than 2 members
remaining_populations <- unique(genpop_data_DAPC@pop) #identify remaining populations
removed_populations <- setdiff(original_populations, remaining_populations) #identify removed populations
if (length(removed_populations) > 0) { #check if any populations were removed
  cols_pop_DAPC <- cols_pop[-(which(original_populations %in% removed_populations))] #remove colors for removed populations
  shapes_pop_DAPC <- shapes_pop[-which(original_populations %in% removed_populations)] #remove shapes for removed populations
} else { #no populations were removed so keep original color and shape assignments
  cols_pop_DAPC <- cols_pop
  shapes_pop_DAPC <- shapes_pop}
Perform DAPC
dapc_result_genpop_data <- adegenet::dapc(
  genpop_data_DAPC, pop = adegenet::pop(genpop_data_DAPC),
  n.pca = as.numeric(optimal_pcs_genpop_data_DAPC_run2),
  n.da = length(levels(genpop_data_DAPC$pop)) - 1) #perform DAPC with optimal PCs (determined by crossvalidation)
Save (and load) results of crossvalidation and DAPC
save(xval_results_genpop_data_DAPC_run1, optimal_pcs_genpop_data_DAPC_run1, 
     xval_results_genpop_data_DAPC_run2, optimal_pcs_genpop_data_DAPC_run2,
     dapc_result_genpop_data,
     file = "DAPC_results_genpop_data.Rdata")
load("DAPC_results_genpop_data.Rdata")
Evaluate DAPC results
par(mar = c(5, 5, 3, 3))
print(round(dapc_result_genpop_data$var, 2) * 100) #conserved variance in percent
## [1] 64
barplot(dapc_result_genpop_data$eig, names.arg = paste0("LD", seq_along(dapc_result_genpop_data$eig)), xlab = "Linear discriminants", ylab = "Eigenvalues") #plot eigenvalues of discriminant functions

round(dapc_result_genpop_data$eig, 1) #show eigenvalues of discriminant functions
## [1] 2597.9 1394.0  611.9  511.6  138.8   72.6
dapc_relative_LDs <- round(dapc_result_genpop_data$eig / sum(dapc_result_genpop_data$eig) * 100, 1)
barplot(dapc_relative_LDs, #plot relative proportions of eigenvalues of discriminant functions
        names.arg = paste0("LD", seq_along(dapc_result_genpop_data$eig)), 
        xlab = "Linear discriminants", ylab = "Relative proportion of eigenvalues [%]")

Plot DAPC results
LD1_label <- paste("LD1 (", dapc_relative_LDs[1], "%)", sep = "") #create axis labels for LD1 showing relative proportions of eigenvalues of discriminant functions in percent
LD2_label <- paste("LD2 (", dapc_relative_LDs[2], "%)", sep = "") #create axis labels for LD2 showing relative proportions of eigenvalues of discriminant functions in percent
LD3_label <- paste("LD3 (", dapc_relative_LDs[3], "%)", sep = "") #create axis labels for LD3 showing relative proportions of eigenvalues of discriminant functions in percent
dapc_df_genpop_data <- data.frame(dapc_result_genpop_data$ind.coord, group = factor(dapc_result_genpop_data$grp, levels = levels(population_assignment))) #create dataframe for visualization

svg("DAPC_LD1_LD2_genpop_data.svg", width = plot_width, height = plot_height)
ggplot(dapc_df_genpop_data, aes(x = LD1, y = LD2)) + #DAPC scatterplot of LD1 vs LD2
  geom_point(size = point_size, alpha = point_alpha, aes(shape = group, fill = group), color = point_outline_color) +
  scale_shape_manual(values = shapes_pop_DAPC) +
  scale_fill_manual(values = cols_pop_DAPC) +
  labs(shape = population_assignment_name, fill = population_assignment_name, x = LD1_label, y = LD2_label) +
  theme_classic() +
  guides(override.aes = list(alpha = 1))
dev.off()

svg("DAPC_LD1_LD3_genpop_data.svg", width = plot_width, height = plot_height)
ggplot(dapc_df_genpop_data, aes(x = LD1, y = LD3)) + #DAPC scatterplot of LD1 vs LD3
  geom_point(size = point_size, alpha = point_alpha, aes(shape = group, fill = group), color = point_outline_color) +
  scale_shape_manual(values = shapes_pop_DAPC) +
  scale_fill_manual(values = cols_pop_DAPC) +
  labs(shape = population_assignment_name, fill = population_assignment_name, x = LD1_label, y = LD3_label) +
  theme_classic() +
  guides(override.aes = list(alpha = 1))
dev.off()

plotly::plot_ly( #3D plot of DAPC using LD1-LD3
  data = dapc_df_genpop_data,
  x = ~LD1, y = ~LD2, z = ~LD3,
  color = ~group, colors = cols_pop_DAPC,
  type = 'scatter3d', mode = 'markers',
  alpha = point_alpha_3D_plot, #marker transparency
  marker = list(size = point_size_3D_plot)) #marker size
Create Structure-like plot showing cluster posterior probabilities for individuals across populations
dapc_group_memberships <- as.data.frame(dapc_result_genpop_data$posterior) #convert posterior probabilities to data frame
dapc_group_memberships$ID <- row.names(dapc_group_memberships) #add ID column
dapc_group_memberships$Population <- genpop_data$pop[match(dapc_group_memberships$ID, indNames(genpop_data))] #match populations to IDs
dapc_group_memberships_long <- melt(dapc_group_memberships, id.vars = c("ID", "Population"), variable.name = "Cluster", value.name = "Probability") #reshape to long format

svg("DAPC_membership_probabilities.svg", width = 35/2.56, height = 30/2.56)
ggplot(dapc_group_memberships_long, aes(x = Probability, y = ID, fill = Cluster)) + #create Structure-like plots 
  geom_bar(stat = "identity") + 
  theme_classic() +
  scale_fill_manual(values = cols_pop_DAPC) + #use viridis color palette
  facet_wrap(~ Population, scales = "free_y") + #split y-axis by population
  labs(fill = "Cluster") +
  theme(axis.text.y = element_text(angle = 0, hjust = 1, size = 4.1), #modify size of y-axis labels
        axis.text.x = element_blank(), #remove x-axis labels
        axis.title.y = element_blank(), #remove y-axis title
        axis.title.x = element_blank(), #remove x-axis title
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), 
        panel.background = element_blank(), 
        plot.background = element_blank())
dev.off()

Assess allele differentiation
dapc_loading_LD1 <- adegenet::loadingplot(dapc_result_genpop_data$var.contr[, 1], lab.jitter = 100, #plot loading for first LD axis 
                                          thres = quantile(dapc_result_genpop_data$var.contr[, 1], 0.99)) #set threshold to 99th percentile (top 1% of loadings)

dapc_loading_LD2 <- adegenet::loadingplot(dapc_result_genpop_data$var.contr[, 2], lab.jitter = 100, #plot loading for second LD axis 
                                          thres = quantile(dapc_result_genpop_data$var.contr[, 2], 0.99)) #set threshold to 99th percentile (top 1% of loadings)

dapc_loading_LD1$var.names #print most differentiated alleles for LD1
##  [1] "209826:27:+.0"  "209826:27:+.1"  "347604:17:+.0"  "347604:17:+.1" 
##  [5] "548637:28:+.0"  "548637:28:+.1"  "607571:65:-.0"  "607571:65:-.1" 
##  [9] "623498:13:+.0"  "623498:13:+.1"  "647037:10:+.0"  "647037:10:+.1" 
## [13] "686382:4:-.1"   "697527:53:-.0"  "697527:53:-.1"  "707554:19:-.0" 
## [17] "707554:19:-.1"  "750141:18:-.0"  "750141:18:-.1"  "761132:51:-.0" 
## [21] "761132:51:-.1"  "762990:31:+.0"  "762990:31:+.1"  "917171:7:+.0"  
## [25] "917171:7:+.1"   "927164:6:+.0"   "927164:6:+.1"   "959252:10:+.0" 
## [29] "959252:10:+.1"  "1021247:13:-.1" "1021247:13:-.0" "1034909:8:-.0" 
## [33] "1034909:8:-.1"  "1050155:10:-.0" "1050155:10:-.1" "1079236:33:-.0"
## [37] "1079236:33:-.1" "1084025:16:-.0" "1084025:16:-.1" "1179508:42:-.0"
## [41] "1179508:42:-.1" "1294262:5:+.0"  "1294262:5:+.1"  "1315236:34:-.0"
## [45] "1315236:34:-.1" "1697906:21:+.0" "1697906:21:+.1" "1697966:7:-.0" 
## [49] "1697966:7:-.1"  "1702204:25:+.0" "1702204:25:+.1" "1739335:15:+.0"
## [53] "1739335:15:+.1" "1748724:12:+.0" "1748724:12:+.1" "1875485:36:+.0"
## [57] "1875485:36:+.1" "2004857:55:-.1" "2004857:55:-.0" "2011745:23:-.0"
## [61] "2011745:23:-.1" "2127415:13:+.0" "2127415:13:+.1" "2147586:28:-.0"
## [65] "2147586:28:-.1" "2202785:6:-.0"  "2202785:6:-.1"  "2702045:14:+.0"
## [69] "2702045:14:+.1" "2716161:8:-.0"  "2716161:8:-.1"  "2816714:11:-.0"
## [73] "2816714:11:-.1" "2841083:17:-.1" "2841083:17:-.0" "2939340:6:+.0" 
## [77] "2939340:6:+.1"  "3134272:9:+.0"  "3134272:9:+.1"  "3177228:11:+.0"
## [81] "3177228:11:+.1" "3278150:15:+.0" "3278150:15:+.1" "3293069:11:+.0"
## [85] "3293069:11:+.1" "3322807:5:-.0"  "3322807:5:-.1"  "3361896:11:+.0"
## [89] "3361896:11:+.1" "3394277:14:+.0" "3394277:14:+.1" "3408317:12:-.0"
## [93] "3408317:12:-.1"
dapc_loading_LD2$var.names #print most differentiated alleles for LD2
##  [1] "131926:18:-.0"  "131926:18:-.1"  "151044:32:-.0"  "151044:32:-.1" 
##  [5] "214427:4:-.0"   "214427:4:-.1"   "289798:19:-.0"  "289798:19:-.1" 
##  [9] "320857:36:+.0"  "320857:36:+.1"  "383196:13:-.1"  "383196:13:-.0" 
## [13] "394990:26:+.0"  "394990:26:+.1"  "477702:9:-.0"   "477702:9:-.1"  
## [17] "621013:10:-.0"  "621013:10:-.1"  "647037:10:+.0"  "647037:10:+.1" 
## [21] "683141:30:-.0"  "683141:30:-.1"  "747425:5:+.0"   "747425:5:+.1"  
## [25] "843409:4:-.0"   "843409:4:-.1"   "963094:5:+.0"   "963094:5:+.1"  
## [29] "1097445:13:-.0" "1097445:13:-.1" "1264640:11:+.0" "1264640:11:+.1"
## [33] "1265194:10:+.0" "1265194:10:+.1" "1274354:6:-.0"  "1370346:14:-.0"
## [37] "1370346:14:-.1" "1377033:39:-.0" "1377033:39:-.1" "1383308:16:-.0"
## [41] "1383308:16:-.1" "1444380:14:-.0" "1444380:14:-.1" "1613775:9:+.1" 
## [45] "1613775:9:+.0"  "1666926:22:+.0" "1666926:22:+.1" "1702076:29:-.1"
## [49] "1702076:29:-.0" "1757045:30:-.0" "1757045:30:-.1" "1783088:9:+.0" 
## [53] "1783088:9:+.1"  "1784216:10:-.0" "1784216:10:-.1" "1882276:9:-.0" 
## [57] "1882276:9:-.1"  "1912415:4:+.0"  "1912415:4:+.1"  "2028920:8:+.0" 
## [61] "2028920:8:+.1"  "2095604:4:+.0"  "2095604:4:+.1"  "2151331:14:-.0"
## [65] "2151331:14:-.1" "2175590:6:-.0"  "2175590:6:-.1"  "2186599:17:-.0"
## [69] "2186599:17:-.1" "2393970:12:+.0" "2393970:12:+.1" "2486135:9:-.0" 
## [73] "2486135:9:-.1"  "2511053:6:-.0"  "2511053:6:-.1"  "2625995:34:+.0"
## [77] "2625995:34:+.1" "2692085:33:+.0" "2692085:33:+.1" "2871045:6:+.0" 
## [81] "2871045:6:+.1"  "2971377:6:-.0"  "2971377:6:-.1"  "3059401:17:+.0"
## [85] "3059401:17:+.1" "3167391:9:-.0"  "3167391:9:-.1"  "3306116:7:+.0" 
## [89] "3306116:7:+.1"  "3402547:9:+.0"  "3402547:9:+.1"  "3421689:6:+.0" 
## [93] "3421689:6:+.1"
intersect(dapc_loading_LD1$var.names, dapc_loading_LD2$var.names) #show differentiated alleles for both LD1 and LD2
## [1] "647037:10:+.0" "647037:10:+.1"
Assess and visualize group memberships
dapc_group_memberships <- as.data.frame(dapc_result_genpop_data$posterior) #convert posterior probabilities to data frame
subset_size <- 40 #define subset size
plot_assignments <- function(dapc_result, subset_size, max_rows) { #plot group memberships (heat colors represent membership probabilities with red=1 and white=0, blue crosses represent prior Cluster_assignment provided to DAPC)
  num_plots <- ceiling(max_rows / subset_size) #calculate number of plots needed
  for (i in 0:(num_plots - 1)) {
    start_row <- i * subset_size + 1 #start index for subset
    end_row <- min((i + 1) * subset_size, max_rows) #end index for subset
    cat("Plotting rows", start_row, "to", end_row, "\n") #print status
    adegenet::assignplot(dapc_result, subset = start_row:end_row)}} #plot assignments for current subset
par(mar = c(5, 13, 3, 3))
plot_assignments(dapc_result_genpop_data, subset_size, nrow(dapc_group_memberships)) #plot all subsets
## Plotting rows 1 to 40

## Plotting rows 41 to 80

## Plotting rows 81 to 118

Identify and plot admixed individuals
dapc_result_genpop_data$grp <- factor(dapc_result_genpop_data$grp, levels = unique(genpop_data_DAPC$pop))
admixed_individuals <- which(apply(dapc_result_genpop_data$posterior, 1, function(e) all(e < 0.80))) #identify admixed individuals (posterior probabilities less than 0.80)
admixed_individuals #show admixed individuals
## named integer(0)
if (length(admixed_individuals) > 0) {
  adegenet::compoplot(dapc_result_genpop_data, col.pal = cols_pop_DAPC, subset = admixed_individuals)}
Create function for DAPC mapping
## Create function to plot sample map with cluster assignment for each individual
plot.Map.assig.prob <- function(ancestry_matrix,
                                Coordinates, #coordinates as "Latitude" and "Longitude" columns in dataframe/matrix
                                lat.buffer.range = 2, #add coordinates as buffer range around latitude coordinates
                                lon.buffer.range = 2, #add coordinates as buffer range around longitude coordinates
                                save = F, #whether to save plot or not
                                overwrite = T, #whether to overwrite if file exists
                                file.name = NULL, #plot file name (NULL = default file name)
                                type = "svg", #plot format type (options: "png", "svg", "jpg")
                                width = 15, #plot width in cm
                                height = 20, #plot height in cm
                                resolution = 300, #plot resolution in dpi
                                pie.size = 1.9, #pie chart size
                                pie.col.pal, #color palette of pie charts
                                USA.add.states = T, #option to add US states (only works if range includes USA)
                                USA.add.counties = F, #option to add US counties (only works if range includes USA)
                                USA.state.lwd = 0.5, #linewidth of US state borders (only works if range includes USA)
                                USA.county.lwd = 0.3, #linewidth of US county borders (only works if range includes USA)
                                north.arrow.position = c(0.03, 0.88), #position (x, y) of north arrow relative to map
                                north.arrow.length = 0.7, #length of north arrow
                                north.arrow.lwd = 2, #linewidth of north arrow
                                north.arrow.N.position = 0.3, #position of north arrow "N"
                                north.arrow.N.size = 1, #size of north arrow "N"
                                scale.position = c(0.03, 0.05), #relative position (x, y) of scale
                                scale.size = 0.16, #size of scale
                                scale.font.size = 0.54, #font size of scale text
                                legend.position = "topright", #position of legend
                                legend.cluster.names = NULL, #names of clusters in legend (if NULL, default is used, otherwise use vector with length of number of clusters)
                                legend.font.size = 1, #font size of legend text
                                legend.box = T, #create white box around legend
                                legend.symbol.size = 1.5) { #size of legend symbols
  
  # Ensure ancestry_matrix is valid
  if (is.null(ancestry_matrix) || !is.matrix(ancestry_matrix)) {
    stop("ancestry_matrixis not valid")
  }
  
  # Check if Coordinates is data frame or matrix
  if (!is.data.frame(Coordinates) && !is.matrix(Coordinates)) {
    stop("Coordinates must be a data frame or matrix")
  }
  
  # Convert matrix to data frame if necessary
  if (is.matrix(Coordinates)) {
    Coordinates <- as.data.frame(Coordinates)
  }
  
  # Check if Coordinates contain Latitude and Longitude
  if (!all(c("Latitude", "Longitude") %in% names(Coordinates))) {
    stop("Coordinates must contain 'Latitude' and 'Longitude' columns")
  }
  
  # Check for NA values in Latitude and Longitude
  if (any(is.na(Coordinates$Latitude)) || any(is.na(Coordinates$Longitude))) {
    stop("Coordinates must not contain NA values in 'Latitude' or 'Longitude' columns")
  }
  
  # Prepare ancestry matrix
  q_matrix = as.data.frame(ancestry_matrix) #convert ancestry_matrix to dataframe
  q_matrix$Ind = rownames(ancestry_matrix) #add sample names
  
  # Check if number of rows in ancestry matrix matches number of coordinates
  if (nrow(q_matrix) != nrow(Coordinates)) {
    stop("Number of samples in ancestry_matrix does not match number of samples in Coordinates")
  }
  
  # Define color palette for pie charts
  k.cols = pie.col.pal
  
  # Define map boundaries
  lon_min = min(Coordinates$Longitude) - lon.buffer.range
  lon_max = max(Coordinates$Longitude) + lon.buffer.range
  lat_min = min(Coordinates$Latitude) - lat.buffer.range
  lat_max = max(Coordinates$Latitude) + lat.buffer.range
  
  # Create plot
  if (dev.cur() > 1) {
    dev.off() #close current device if open to avoid unwanted graphic distortions and other effects
  }
  
  # Set plot saving
  if (save) {
    
    # Set file name for saving
    if (is.null(file.name)) {
      file.name <- paste0("Assignment_probability_map_plot.", type)
    }
    
    # Set overwriting 
    if (file.exists(file.name) && !overwrite) {
      stop(paste(file.name, "already exists - set overwrite = T to overwrite"))
    }
    
    # Set file type
    if (type == "svg") {
      svg(file.name, 
          width = width / 2.54, 
          height = height / 2.54)
    } else if (type == "png") {
      png(file.name, 
          width = width, 
          height = height, 
          res = resolution, 
          units = "cm")
    } else if (type == "jpg") {
      jpeg(file.name, 
           width = width, 
           height = height, 
           res = resolution, 
           units = "cm")
    } else {
      stop("Unsupported plot file type - choose from 'svg', 'png', or 'jpg'")
    }
  }
  
  # Set layout and margins
  par(mfrow = c(1, 1),
      mar = c(1, 1, 1, 1))
  
  # Create map
  maps::map("world", 
            fill = T, 
            col = "lightgrey", 
            xlim = c(lon_min, lon_max), 
            ylim = c(lat_min, lat_max))
  maps::map.axes()
  
  # Add US counties if requested
  if (USA.add.counties) {
    maps::map("county", 
              add = T, 
              col = "grey", 
              lwd = USA.county.lwd)
  }
  
  # Add US states if requested
  if (USA.add.states) {
    maps::map("state",
              add = T, 
              col = "black", 
              lwd = USA.state.lwd)
  }
  
  # Add pie charts to map
  for (i in 1:nrow(q_matrix)) {
    coords = matrix(c(Coordinates$Longitude[i], Coordinates$Latitude[i]), ncol = 2, byrow = T)
    make.admix.pie.plot(admix.proportions = as.matrix(q_matrix[i, -ncol(q_matrix), drop = F]),
                        coords = coords,
                        layer.colors = k.cols,
                        radii = pie.size,
                        add = T)
  }
  
  # Define legend labels
  if (is.null(legend.cluster.names)) {
    legend.labels <- paste("Cluster", 1:length(k.cols)) #set default labels
  } else {
    if (length(legend.cluster.names) != length(k.cols)) {
      print("Length of legend.cluster.names must match number of clusters - default labels are used")
      legend.labels <- paste("Cluster", 1:length(k.cols)) #default labels
    }
    legend.labels <- legend.cluster.names #use custom labels
  }
  
  # Set legend box
  if (legend.box) {
    legend_box <- "o"
  } else {
    legend_box <- "n"  
  }
  
  # Add legend
  legend(legend.position, 
         legend = legend.labels,
         pch = 21,
         cex = legend.font.size,
         pt.cex = legend.symbol.size,
         pt.bg = k.cols,
         bty = legend_box)
  
  # Add scale
  scale_position_x = scale.position[1] * (lon_max - lon_min) + lon_min
  scale_position_y = scale.position[2] * (lat_max - lat_min) + lat_min
  maps::map.scale(x = scale_position_x,
                  y = scale_position_y,
                  cex = scale.font.size,
                  relwidth = scale.size,
                  ratio = F)
  
  # Add north arrow
  north_arrow_x = north.arrow.position[1] * (lon_max - lon_min) + lon_min
  north_arrow_y = north.arrow.position[2] * (lat_max - lat_min) + lat_min
  arrows(x0 = north_arrow_x, 
         y0 = north_arrow_y, 
         x1 = north_arrow_x, 
         y1 = north_arrow_y + north.arrow.length, 
         length = 0.13, 
         col = "black", 
         lwd = north.arrow.lwd)
  
  # Add North "N" above north arrow
  text(x = north_arrow_x, 
       y = north_arrow_y + north.arrow.length + north.arrow.N.position, #adjust position for "N"
       labels = "N", 
       cex = north.arrow.N.size, 
       col = "black")
  
  # Close graphics device
  if (save) {
    dev.off()
    message(paste("Plot", ifelse(overwrite, "overwritten to", "saved to"), file.name))
  }
}
Create map with DAPC clusters
## Plot map with DAPC assignment probabilities
common_ids <- intersect(rownames(dapc_result_genpop_data$posterior), dataset$ID) #extract shared IDs
dapc_coordinates <- dataset[match(common_ids, dataset$ID), c("Longitude", "Latitude")] #subset and reorder coordinates
dapc_anestry_matrix <- dapc_result_genpop_data$posterior[common_ids, , drop = FALSE]
plot.Map.assig.prob(ancestry_matrix = dapc_anestry_matrix, 
                    Coordinates = dapc_coordinates,
                    legend.cluster.names = colnames(dapc_anestry_matrix),
                    lon.buffer.range = 5,
                    lat.buffer.range = 5,
                    legend.position = "bottomright",
                    save = T,
                    type = "svg",
                    file.name = "DAPC_map.svg",
                    pie.size = 2.3,
                    pie.col.pal = cols_pop_DAPC,
                    overwrite = T,
                    width = 30,
                    height = 20,
                    north.arrow.length = 1.8,
                    legend.font.size = 1, #font size of legend text
                    north.arrow.N.position = 0.7,
                    legend.symbol.size = 1.5,
                    north.arrow.position = c(0.99, 0.35))
## Plot overwritten to DAPC_map.svg

B) DAPC with grouping identified via k-means

Identify prior grouping via k-means clustering and BIC

Leave out “n.clust” option in find.clusters to perform kmeans clustering and search for best k using BIC (lowest BIC is best K; if several K with similar low BIC, choose first one representing bend in BIC curve). Here I fixed k = 5 using “n.clust” based on a prior search (not shown).

clusters_assignment_genpop_data_DAPC <- as.data.frame(clusters_kmeans_genpop_data_DAPC$grp) #assignment of each individual to the chosen number of clusters
colnames(clusters_assignment_genpop_data_DAPC)[1] <- "Cluster_ID" #rename first column
clusters_kmeans_genpop_data_DAPC$size #number of individuals assigned to each Cluster_assignment
## [1] 64 18 11 16  9
Replace NA values with column means and format genotype matrix
genpop_data_DAPC@tab <- matrix(
  as.integer(round(apply(genpop_data_DAPC@tab, 2, function(x) {
    x[is.na(x)] <- mean(x, na.rm = T) #replace NA with column means
    return(x)}))), #return modified column
  nrow = nrow(genpop_data_DAPC@tab), #number of rows
  ncol = ncol(genpop_data_DAPC@tab), #number of columns
  dimnames = dimnames(genpop_data_DAPC@tab)) #preserve original dimensions
Perform DAPC using k-clusters as prior grouping
dapc_result_genpop_data_kmeans <- dapc(genpop_data_DAPC, pop = clusters_assignment_genpop_data_DAPC$Cluster_ID,
                                       n.pca = as.numeric(optimal_pcs_genpop_data_DAPC_kmeans_run2), #optimal number of PCs from cross-validation
                                       n.da = length(unique(clusters_assignment_genpop_data_DAPC$Cluster_ID)) - 1) #number of discriminant functions (clusters - 1)
Save results
save(xval_results_genpop_data_DAPC_kmeans_run1, optimal_pcs_genpop_data_DAPC_kmeans_run1, 
     xval_results_genpop_data_DAPC_kmeans_run2, optimal_pcs_genpop_data_DAPC_kmeans_run2,
     dapc_result_genpop_data_kmeans,
     file = "DAPC_results_genpop_data_kmeans.Rdata")
load("DAPC_results_genpop_data_kmeans.Rdata")
Evaluate DAPC results
print(round(dapc_result_genpop_data_kmeans$var, 2) * 100) #conserved variance in percent
## [1] 80
par(mar = c(5, 5, 3, 3))
barplot(dapc_result_genpop_data_kmeans$eig, names.arg = paste0("LD", seq_along(dapc_result_genpop_data_kmeans$eig)), #plot eigenvalues of discriminant functions 
        xlab = "Linear discriminants", ylab = "Eigenvalues")

round(dapc_result_genpop_data_kmeans$eig, 1) #show eigenvalues of discriminant functions
## [1] 6234.2 5672.2 3588.7 2535.2
dapc_kmeans_relative_LDs <- round(dapc_result_genpop_data_kmeans$eig / sum(dapc_result_genpop_data_kmeans$eig) * 100, 1)
barplot(dapc_kmeans_relative_LDs, #plot relative proportions of eigenvalues of discriminant functions
        names.arg = paste0("LD", seq_along(dapc_result_genpop_data_kmeans$eig)), 
        xlab = "Linear discriminants", ylab = "Relative proportion of eigenvalues [%]")

Visualize main DAPC results
cols_pop_DAPC_kmeans <- viridis::viridis(n = length(unique(clusters_assignment_genpop_data_DAPC$Cluster_ID)), begin = 0, end = 1) #define color palette
shapes_pop_DAPC_kmeans <- rep(c(21, 24, 22, 25), length.out = length(unique(clusters_assignment_genpop_data_DAPC$Cluster_ID))) #define shape palette
dapc_kmeans_df_genpop_data <- data.frame(dapc_result_genpop_data_kmeans$ind.coord, group = dapc_result_genpop_data_kmeans$grp) #create dataframe for visualization
LD1_label <- paste("LD1 (", dapc_kmeans_relative_LDs[1], "%)", sep = "") #create axis labels for LD1 showing relative proportions of eigenvalues of discriminant functions in percent
LD2_label <- paste("LD2 (", dapc_kmeans_relative_LDs[2], "%)", sep = "") #create axis labels for LD2 showing relative proportions of eigenvalues of discriminant functions in percent
LD3_label <- paste("LD3 (", dapc_kmeans_relative_LDs[3], "%)", sep = "") #create axis labels for LD3 showing relative proportions of eigenvalues of discriminant functions in percent

svg("DAPC_kmeans_LD1_LD2_genpop_data.svg", width = plot_width, height = plot_height)
ggplot(dapc_kmeans_df_genpop_data, aes(x = LD1, y = LD2)) + #kmeans DAPC scatterplot of LD1 vs LD2
  geom_point(size = point_size, alpha = point_alpha, aes(shape = group, fill = group), color = point_outline_color) +
  scale_shape_manual(values = shapes_pop_DAPC_kmeans) +
  scale_fill_manual(values = cols_pop_DAPC_kmeans) +
  labs(shape = population_assignment_name, fill = population_assignment_name, x = LD1_label, y = LD2_label) +
  theme_classic() +
  guides(override.aes = list(alpha = 1)) +
  coord_equal()
dev.off()

svg("DAPC_kmeans_LD1_LD3_genpop_data.svg", width = plot_width, height = plot_height)
ggplot(dapc_kmeans_df_genpop_data, aes(x = LD1, y = LD3)) + #kmeans DAPC scatterplot of LD1 vs LD3
  geom_point(size = point_size, alpha = point_alpha, aes(shape = group, fill = group), color = point_outline_color) +
  scale_shape_manual(values = shapes_pop_DAPC_kmeans) +
  scale_fill_manual(values = cols_pop_DAPC_kmeans) +
  labs(shape = population_assignment_name, fill = population_assignment_name, x = LD1_label, y = LD3_label) +
  theme_classic() +
  guides(override.aes = list(alpha = 1))
dev.off()

plotly::plot_ly( #3D plot of DAPC_kmeans using LD1-LD3
  data = dapc_kmeans_df_genpop_data,
  x = ~LD1, y = ~LD2, z = ~LD3,
  color = ~group, colors = cols_pop_DAPC_kmeans,
  type = 'scatter3d', mode = 'markers',
  alpha = point_alpha_3D_plot, #marker transparency
  marker = list(size = point_size_3D_plot)) #marker size
Create Structure-like plot showing cluster posterior probabilities for individuals across populations
dapc_kmeans_group_memberships <- as.data.frame(dapc_result_genpop_data_kmeans$posterior) #convert posterior probabilities to data frame
dapc_kmeans_group_memberships$ID <- row.names(dapc_kmeans_group_memberships) #add ID column
dapc_kmeans_group_memberships$Population <- genpop_data$pop[match(dapc_kmeans_group_memberships$ID, indNames(genpop_data))] #match populations to IDs
dapc_kmeans_group_memberships_long <- melt(dapc_kmeans_group_memberships, id.vars = c("ID", "Population"), variable.name = "Cluster", value.name = "Probability") #reshape to long format

svg("DAPC_kmeans_Membership_probabilities.svg", width = 30/2.56, height = 30/2.56)
ggplot(dapc_kmeans_group_memberships_long, aes(x = Probability, y = ID, fill = Cluster)) + #create Structure-like plots 
  geom_bar(stat = "identity") + 
  theme_classic() +
  scale_fill_manual(values = cols_pop_DAPC_kmeans) + #use viridis color palette
  facet_wrap(~ Population, scales = "free_y") + #split y-axis by population
  labs(fill = "Cluster") +
  theme(axis.text.y = element_text(angle = 0, hjust = 1, size = 3.3), axis.text.x = element_blank(), 
        axis.title.y = element_blank(), axis.title.x = element_blank(), panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(), panel.background = element_blank(), plot.background = element_blank())
dev.off()

##### Assess allele differentiation
dapc_kmeans_loading_LD1 <- adegenet::loadingplot(dapc_result_genpop_data_kmeans$var.contr[, 1], lab.jitter = 100, #plot loading for first LD axis 
                                                 thres = quantile(dapc_result_genpop_data_kmeans$var.contr[, 1], 0.99)) #set threshold to 99th percentile (top 1% of loadings)

dapc_kmeans_loading_LD2 <- adegenet::loadingplot(dapc_result_genpop_data_kmeans$var.contr[, 2], lab.jitter = 100, #plot loading for second LD axis 
                                                 thres = quantile(dapc_result_genpop_data_kmeans$var.contr[, 2], 0.99)) #set threshold to 99th percentile (top 1% of loadings)

dapc_kmeans_loading_LD1$var.names #print most differentiated alleles for LD1
##  [1] "95445:20:+.0"   "95445:20:+.1"   "98412:7:-.0"    "98412:7:-.1"   
##  [5] "186168:57:-.0"  "186168:57:-.1"  "361864:6:+.0"   "361864:6:+.1"  
##  [9] "463883:8:-.0"   "463883:8:-.1"   "527307:6:+.0"   "527307:6:+.1"  
## [13] "587353:5:+.0"   "587353:5:+.1"   "651105:43:-.0"  "651105:43:-.1" 
## [17] "771979:45:+.0"  "771979:45:+.1"  "776727:19:-.0"  "776727:19:-.1" 
## [21] "834920:17:-.0"  "834920:17:-.1"  "1057500:18:-.0" "1057500:18:-.1"
## [25] "1190182:13:-.0" "1190182:13:-.1" "1197982:37:-.0" "1197982:37:-.1"
## [29] "1256354:9:-.0"  "1256354:9:-.1"  "1326541:35:-.1" "1326541:35:-.0"
## [33] "1398359:49:-.0" "1398359:49:-.1" "1398440:37:+.0" "1398440:37:+.1"
## [37] "1448846:21:-.0" "1448846:21:-.1" "1478384:6:+.0"  "1478384:6:+.1" 
## [41] "1577083:16:+.0" "1577083:16:+.1" "1644575:15:+.0" "1644575:15:+.1"
## [45] "1648234:11:-.1" "1648234:11:-.0" "1684898:43:+.0" "1684898:43:+.1"
## [49] "1686749:18:-.0" "1686749:18:-.1" "1825465:38:-.0" "1825465:38:-.1"
## [53] "1848422:9:-.0"  "1848422:9:-.1"  "2013619:6:-.0"  "2013619:6:-.1" 
## [57] "2026394:30:-.0" "2026394:30:-.1" "2028920:8:+.0"  "2028920:8:+.1" 
## [61] "2041339:24:+.0" "2041339:24:+.1" "2044034:20:+.1" "2044034:20:+.0"
## [65] "2219166:4:-.0"  "2219166:4:-.1"  "2336237:41:+.0" "2336237:41:+.1"
## [69] "2418655:18:+.0" "2418655:18:+.1" "2425420:38:-.0" "2425420:38:-.1"
## [73] "2527226:5:-.0"  "2527226:5:-.1"  "2716161:8:-.0"  "2716161:8:-.1" 
## [77] "2896503:13:+.0" "2896503:13:+.1" "2914166:4:-.0"  "2914166:4:-.1" 
## [81] "2984726:5:-.0"  "2984726:5:-.1"  "3014454:28:-.0" "3014454:28:-.1"
## [85] "3061381:9:-.0"  "3061381:9:-.1"  "3103167:9:+.1"  "3103167:9:+.0" 
## [89] "3232852:11:-.0" "3232852:11:-.1" "3408274:20:+.0" "3408274:20:+.1"
dapc_kmeans_loading_LD2$var.names #print most differentiated alleles for LD2
##  [1] "209826:27:+.0"  "209826:27:+.1"  "292767:9:-.0"   "292767:9:-.1"  
##  [5] "396591:27:-.0"  "396591:27:-.1"  "686382:4:-.0"   "686382:4:-.1"  
##  [9] "691278:5:-.0"   "691278:5:-.1"   "703009:18:-.0"  "703009:18:-.1" 
## [13] "750141:18:-.0"  "750141:18:-.1"  "759739:26:+.1"  "759739:26:+.0" 
## [17] "761132:51:-.0"  "761132:51:-.1"  "776727:19:-.0"  "776727:19:-.1" 
## [21] "917171:7:+.0"   "917171:7:+.1"   "924947:8:-.0"   "924947:8:-.1"  
## [25] "927164:6:+.0"   "927164:6:+.1"   "959252:10:+.0"  "959252:10:+.1" 
## [29] "1050155:10:-.0" "1050155:10:-.1" "1073232:13:+.0" "1073232:13:+.1"
## [33] "1117117:18:+.0" "1117117:18:+.1" "1135134:12:+.0" "1135134:12:+.1"
## [37] "1261020:19:-.0" "1261020:19:-.1" "1335232:9:-.0"  "1335232:9:-.1" 
## [41] "1549631:7:+.0"  "1549631:7:+.1"  "1666926:22:+.0" "1666926:22:+.1"
## [45] "1697966:7:-.0"  "1697966:7:-.1"  "1739335:15:+.0" "1739335:15:+.1"
## [49] "1789990:10:+.0" "1789990:10:+.1" "1791591:9:-.0"  "1791591:9:-.1" 
## [53] "2004857:55:-.1" "2004857:55:-.0" "2011745:23:-.0" "2011745:23:-.1"
## [57] "2127415:13:+.0" "2127415:13:+.1" "2147586:28:-.0" "2147586:28:-.1"
## [61] "2202785:6:-.0"  "2202785:6:-.1"  "2342825:6:+.0"  "2342825:6:+.1" 
## [65] "2559347:4:-.0"  "2559347:4:-.1"  "2702045:14:+.0" "2702045:14:+.1"
## [69] "2716161:8:-.0"  "2716161:8:-.1"  "2816714:11:-.0" "2816714:11:-.1"
## [73] "2909766:57:-.0" "2909766:57:-.1" "2939340:6:+.0"  "2939340:6:+.1" 
## [77] "3048695:18:-.0" "3048695:18:-.1" "3074851:7:+.0"  "3074851:7:+.1" 
## [81] "3103167:9:+.1"  "3103167:9:+.0"  "3134272:9:+.0"  "3134272:9:+.1" 
## [85] "3177228:11:+.0" "3177228:11:+.1" "3278150:15:+.0" "3278150:15:+.1"
## [89] "3322807:5:-.0"  "3322807:5:-.1"  "3329625:9:+.0"  "3329625:9:+.1"
intersect(dapc_kmeans_loading_LD1$var.names, dapc_kmeans_loading_LD2$var.names) #show differentiated alleles for both LD1 and LD2
## [1] "776727:19:-.0" "776727:19:-.1" "2716161:8:-.0" "2716161:8:-.1"
## [5] "3103167:9:+.1" "3103167:9:+.0"
Assess and visualize group memberships
dapc_kmeans_group_memberships <- as.data.frame(dapc_result_genpop_data_kmeans$posterior) #convert posterior probabilities to data frame
subset_size <- 40 #define subset size
plot_assignments <- function(dapc_kmeans_result, subset_size, max_rows) { #plot group memberships (heat colors represent membership probabilities with red=1 and white=0, blue crosses represent prior Cluster_assignment provided to DAPC kmeans)
  num_plots <- ceiling(max_rows / subset_size) #calculate number of plots needed
  for (i in 0:(num_plots - 1)) {
    start_row <- i * subset_size + 1 #start index for subset
    end_row <- min((i + 1) * subset_size, max_rows) #end index for subset
    cat("Plotting rows", start_row, "to", end_row, "\n") #print status
    adegenet::assignplot(dapc_kmeans_result, subset = start_row:end_row)}} #plot assignments for current subset
par(mar = c(5, 11, 3, 3))
plot_assignments(dapc_result_genpop_data_kmeans, subset_size, nrow(dapc_kmeans_group_memberships)) #plot all subsets
## Plotting rows 1 to 40

## Plotting rows 41 to 80

## Plotting rows 81 to 118

Identify and plot admixed individuals
dapc_result_genpop_data_kmeans$grp <- factor(dapc_result_genpop_data_kmeans$grp, levels = unique(dapc_result_genpop_data_kmeans$grp))
admixed_individuals <- which(apply(dapc_result_genpop_data_kmeans$posterior, 1, function(e) all(e < 0.80))) #identify admixed individuals (posterior probabilities less than 0.80)
admixed_individuals #show admixed individuals
## named integer(0)
if (length(admixed_individuals) > 0) {adegenet::compoplot(dapc_result_genpop_data_kmeans, col.pal = cols_pop_DAPC_kmeans, subset = admixed_individuals)} #plot admixed individuals
Create map with DAPC clusters
## Plot map with DAPC assignment probabilities
common_ids <- intersect(rownames(dapc_result_genpop_data_kmeans$posterior), dataset$ID) #extract shared IDs
head(common_ids)
dapc_coordinates_kmeans <- dataset[match(common_ids, dataset$ID), c("Longitude", "Latitude")] #subset and reorder coordinates
head(dapc_coordinates_kmeans)
dapc_anestry_matrix_kmeans <- dapc_result_genpop_data_kmeans$posterior[common_ids, , drop = FALSE]
head(dapc_anestry_matrix_kmeans)
cols_pop_DAPC_kmeans
plot.Map.assig.prob(ancestry_matrix = dapc_anestry_matrix_kmeans, 
                    Coordinates = dapc_coordinates_kmeans,
                    legend.cluster.names = colnames(dapc_anestry_matrix_kmeans),
                    lon.buffer.range = 2.5,
                    lat.buffer.range = 2.5,
                    legend.position = "bottomright",
                    save = T,
                    type = "svg",
                    file.name = "DAPC_map_kmeans.svg",
                    pie.size = 2.5,
                    pie.col.pal = cols_pop_DAPC_kmeans,
                    overwrite = T,
                    width = 30,
                    height = 20,
                    north.arrow.length = 1.8,
                    legend.font.size = 1, #font size of legend text
                    north.arrow.N.position = 0.7,
                    legend.symbol.size = 1.5,
                    north.arrow.position = c(0.992, 0.35))
## Plot overwritten to DAPC_map_kmeans.svg